<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JA</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Aging</journal-id>
      <journal-title>JMIR Aging</journal-title>
      <issn pub-type="epub">2561-7605</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e80102</article-id>
      <article-id pub-id-type="pmid">41926761</article-id>
      <article-id pub-id-type="doi">10.2196/80102</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Integrating Care Context With Skeleton and Depth Information for Older Adult Activity Recognition in a Care Facility Using Care-Assessment-Aware Spatiotemporal Transformer: Method and Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Jiang</surname>
            <given-names>Yun</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sun</surname>
            <given-names>Han</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Nahid</surname>
            <given-names>Nazmun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Kyushu Institute of Technology</institution>
            <addr-line>2-4 Hibikino, Wakamatsu Ward</addr-line>
            <addr-line>Kitakyushu, 808-0135</addr-line>
            <country>Japan</country>
            <phone>81 08069989264</phone>
            <email>raian.nahid@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1037-5485</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hassan</surname>
            <given-names>Iqbal</given-names>
          </name>
          <degrees>MEng</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-8679-0536</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Ahad</surname>
            <given-names>Md Atiqur Rahman</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8355-7004</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Inoue</surname>
            <given-names>Sozo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1109-8130</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Kyushu Institute of Technology</institution>
        <addr-line>Kitakyushu</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>University of East London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Nazmun Nahid <email>raian.nahid@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>4</month>
        <year>2026</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e80102</elocation-id>
      <history>
        <date date-type="received">
          <day>4</day>
          <month>7</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>1</day>
          <month>8</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>10</day>
          <month>12</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>16</day>
          <month>12</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Nazmun Nahid, Iqbal Hassan, Md Atiqur Rahman Ahad, Sozo Inoue. Originally published in JMIR Aging (https://aging.jmir.org), 02.04.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Aging, is properly cited. The complete bibliographic information, a link to the original publication on https://aging.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://aging.jmir.org/2026/1/e80102" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Older adult activity recognition is a critical task in long-term care monitoring; yet, it remains challenging due to postural deformities and health-related variability. These factors cause different activities to appear visually similar, or the same activity to appear dissimilar, undermining the effectiveness of traditional human activity recognition models developed for the general population.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to develop an improved older adult activity recognition method that integrates care assessment information with motion data to capture and understand movement variability arising from different health conditions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>To achieve our objective, we propose a care-assessment-aware spatiotemporal transformer (CSTT) model that integrates body key points, heatmaps, and care level data for personalized and context-aware activity recognition. The model dynamically adjusts its attention mechanism based on care level context to improve recognition accuracy. CSTT was trained and validated on real-world older adult motion data. A total of 51 older adult participants (30 men and 21 women; age range of 64-95 years) were included in the study. Among them, 7 (13.7%) required high care assistance, 26 (51.0%) required medium care assistance, and 18 (35.3%) required low care assistance.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Despite data imbalance and considerable intraclass variation due to differing care needs, the proposed CSTT model achieved an <italic>F</italic><sub>1</sub>-score and accuracy of 0.96 and an area under the curve of 0.98. Analysis revealed that movement patterns differ significantly across care levels and that similar motions occur in distinct activities, highlighting the importance of care-aware modeling.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Incorporating care level information into activity recognition models significantly enhances performance in older adult care settings. The proposed CSTT framework demonstrates the value of personalized, context-sensitive approaches for accurate and ethical monitoring in long-term care environments.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>older adult activity recognition</kwd>
        <kwd>activity recognition</kwd>
        <kwd>care data</kwd>
        <kwd>older adult dataset</kwd>
        <kwd>transformer</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Advancements in medical science and technology have led to a global demographic shift, with the older adult population projected to reach 1.2 billion by 2025 and 2 billion by 2050 [<xref ref-type="bibr" rid="ref1">1</xref>]. By 2050, nearly 20% of the world’s population will be older adults [<xref ref-type="bibr" rid="ref2">2</xref>]. As life expectancy increases, age-related physical and cognitive decline necessitates long-term care (LTC) [<xref ref-type="bibr" rid="ref3">3</xref>], placing significant strain on health care systems and LTC facilities [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>Caregivers in LTC facilities perform essential tasks, including hygiene assistance, feeding, dressing, and mobility support [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], while also providing emotional and psychological care [<xref ref-type="bibr" rid="ref11">11</xref>]. However, their close interaction with residents exposes them to emotionally taxing experiences, such as witnessing chronic pain, cognitive decline, and end-of-life care [<xref ref-type="bibr" rid="ref12">12</xref>], leading to compassion fatigue and emotional exhaustion. Caregivers also navigate ethical dilemmas, balancing residents’ autonomy with safety, managing conflicts with families, and making complex decisions [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. These challenges heighten stress and contribute to burnout [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], exacerbating the global caregiver shortage, particularly in Central Asia and Eastern Europe. High annual turnover rates (19%-55%) [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>] worsen staff shortages, increase workloads, and reduce care quality [<xref ref-type="bibr" rid="ref22">22</xref>]. To address shortages, LTC facilities increasingly rely on health care assistants and auxiliary nurses [<xref ref-type="bibr" rid="ref23">23</xref>]. While vital, their limited training can hinder the care of residents with cognitive impairments or chronic conditions [<xref ref-type="bibr" rid="ref24">24</xref>], affecting both caregiver well-being and overall health care sustainability. Ensuring caregiver well-being while maintaining high-quality older adult care is essential, as excessive strain can impact both mental health and service quality. 
Automatic monitoring systems offer a promising solution; however, their real-world implementation requires a robust human activity recognition (HAR) framework that accounts for older adult–specific mobility and health variations. Existing HAR methods mostly use RGB or RGB-D camera [<xref ref-type="bibr" rid="ref25">25</xref>] and inertial measurement units [<xref ref-type="bibr" rid="ref26">26</xref>]. Wearable inertial measurement units, although effective, can be intrusive and uncomfortable. In contrast, depth sensors and RGB cameras provide a nonintrusive, cost-effective alternative, advancing HAR through wireless sensor networks and the Internet of Things.</p>
        <p>HAR approaches typically rely on RGB-based [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>] or skeleton-based [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>] methods, which perform well in general settings but degrade significantly for older adult individuals due to posture and mobility differences influenced by health conditions. A key limitation of traditional HAR models is the lack of care assessment scores (CASs), which are critical in older adult care. Caregivers assign CASs such as the activities of daily living score, Barthel Index, and care level (CL) to assess mobility, physical dependency, and assistance needs. These scores provide vital insights into an individual’s functional abilities, yet state-of-the-art HAR methods fail to incorporate them, limiting their applicability to older adult populations. Since posture and motion patterns are directly affected by health conditions, neglecting these factors leads to poor generalization. Moreover, widely used HAR datasets such as RGBD-HuDaAct [<xref ref-type="bibr" rid="ref33">33</xref>], 3D Action Pairs [<xref ref-type="bibr" rid="ref34">34</xref>], MSR-DailyActivity [<xref ref-type="bibr" rid="ref35">35</xref>], NTU RGB+D 120 [<xref ref-type="bibr" rid="ref36">36</xref>], and UTD-MHAD [<xref ref-type="bibr" rid="ref37">37</xref>] lack older adult–specific data, as they primarily feature younger or healthier participants. This demographic gap hinders HAR models from accurately recognizing older adult activities. To address this challenge, integrating CASs into HAR models is essential for personalized, context-aware monitoring, enabling more adaptive and accurate older adult activity recognition.</p>
      </sec>
      <sec>
        <title>Related Works</title>
        <p>Traditional HAR methods struggle with older adult individuals, leading to research on tailored models and datasets. However, challenges remain. This section reviews state-of-the-art HAR and older adult HAR methods using skeleton and video data, together with older adult–specific datasets.</p>
      </sec>
      <sec>
        <title>Older Adult Activity Recognition</title>
        <p>RGB-based HAR is widely used, particularly for monitoring applications. It primarily extracts motion information from video frames and can be classified into 2 categories: 2-stream networks and 3D convolutional networks. Two-stream networks leverage RGB data for spatial representation and optical flow for temporal dynamics [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]. However, computing optical flow is computationally expensive, creating bottlenecks in real-time applications. In contrast, 3D convolutional networks [<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref48">48</xref>] aim to capture spatiotemporal features directly from video sequences but face challenges related to occlusions, camera motion, and environmental complexities [<xref ref-type="bibr" rid="ref49">49</xref>]. These limitations are even more pronounced in older adult activity recognition, where subtle motion variations and fine-grained details play a crucial role. Skeleton-based approaches offer a compact and robust representation of human motion by focusing on skeletal joints and their temporal evolution. Early works used spatial graph-based models [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref53">53</xref>], while later studies introduced temporal relationships through recurrent and convolutional architectures [<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref58">58</xref>]. However, skeleton-based methods lack environmental context, limiting their effectiveness in recognizing human-object interactions—an essential factor for assessing functional abilities in older adult individuals. To address the limitations of unimodal methods, researchers have explored multimodal fusion techniques that integrate RGB and skeleton data [<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]. 
These methods typically extract features from each modality independently before performing fusion, with some incorporating contextual information such as human-object interactions and location data [<xref ref-type="bibr" rid="ref61">61</xref>-<xref ref-type="bibr" rid="ref63">63</xref>]. Despite improved recognition accuracy, existing multimodal approaches struggle with effective feature aggregation, as irrelevant modality-specific information can degrade overall performance. More importantly, these methods are predominantly trained on young, healthy individuals and lack adaptations tailored for older adult populations. A notable older adult HAR system was introduced in the study by Kim et al [<xref ref-type="bibr" rid="ref64">64</xref>], using depth video and skeleton joint features. While effective in controlled environments, this approach failed to generalize well to real-world scenarios due to the exclusion of contextual and health-related mobility variations and the lack of real-world continuous activity patterns in the training data. Another older adult HAR method proposed a feature fusion model combining handcrafted and deep-learned features using a dedicated dataset [<xref ref-type="bibr" rid="ref65">65</xref>]. However, this approach remains limited by its reliance on a homogeneous environment and its inability to capture long-range dependencies.</p>
        <p>To overcome the limitations of existing methods, we propose a cross-modal and personalized HAR approach that considers older adult health conditions. Our base model is built on transformers, which have demonstrated exceptional performance in HAR due to their ability to model long-range dependencies through self-attention mechanisms. Most transformer-based HAR methods process RGB frames as input tokens [<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref67">67</xref>] or, less commonly, skeleton data [<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]. However, these approaches suffer from high computational costs, restricting their applicability in real-time older adult care environments. In addition, existing transformer-based models do not efficiently integrate cross-modal information, limiting their ability to leverage multimodal dependencies for older adult HAR. To address these challenges, we propose a depth map-based approach instead of RGB to reduce computational costs. We have designed a care-aware attention mechanism (CAM) and incorporated it into the spatial layer, replacing the standard self-attention mechanism in transformers. This effectively facilitates cross-modal analysis by integrating care assessment information, skeleton data, and depth images, thereby enhancing older adult HAR performance.</p>
      </sec>
      <sec>
        <title>Older Adult Dataset</title>
        <p>Although most benchmark HAR datasets are designed for the general population, some datasets have been specifically collected to study older adult individuals. EGOFALLS [<xref ref-type="bibr" rid="ref70">70</xref>] focuses on fall detection using egocentric camera data, containing 10,948 video samples from 14 participants, including 12 young adults and only 2 older adult individuals. Despite its large size, the dataset has a significantly small sample of older adult participants. Additionally, its controlled environment and lack of health condition–based variations make it unsuitable for real-world application training. The Toyota Smarthome Dataset [<xref ref-type="bibr" rid="ref71">71</xref>] captures daily living activities using 7 Kinect sensors from 18 volunteers aged 60-80 years over 8 hours in a controlled apartment setting. While it includes interactions with household objects, its limitations stem from the controlled environment and the absence of health condition–based variations. IntelliRehabDS [<xref ref-type="bibr" rid="ref72">72</xref>] was collected using a Kinect motion sensor and comprises 9 repetitive gestures performed by 29 individuals, including 15 patients and 14 healthy controls. It provides 3D body joint coordinates and depth maps, annotated for gesture type and position (sitting or standing). However, the dataset has a narrow older adult age range (20-60+ years), is collected in controlled conditions, and lacks health condition–specific data. ETRI-Activity3D [<xref ref-type="bibr" rid="ref73">73</xref>] is a large-scale dataset that includes RGB videos, depth maps, and skeleton sequences from 100 participants—50 older adult individuals (aged 64-88 years, with an average age of 77 years) and 50 younger adults (average age of 23 years). Despite its scale, the dataset has limitations, including a lack of intervention data, health condition–based variations, and continuous activity recordings. 
Overall, existing older adult HAR datasets suffer from small and homogeneous older adult sample sizes, controlled environment constraints, and limited continuous activity data, often lacking health condition–specific variations. To address these issues, we collected our dataset in a real care facility without intervention or manipulation, capturing mealtime sessions of 28 older adult participants aged 62-95 years, representing 5 distinct CLs.</p>
      </sec>
      <sec>
        <title>Objective and Contributions</title>
        <p>In this work, our objective is to develop an improved older adult activity recognition method that integrates care assessment information with motion data to effectively capture and understand movement variability caused by different health conditions. To achieve this, we incorporated CL, one of the most widely used CASs in older adult care facilities, due to its availability and relevance in evaluating functional abilities, and introduced a care-assessment-aware spatiotemporal transformer (CSTT) that adapts its attention to key points and depth-based motion patterns based on an individual’s CL, enabling personalized feature prioritization and improved activity prediction. We also addressed the issue of the lack of suitable data and collected a real-world older adult activity dataset incorporating CL information. The key contributions of our work can be summarized as follows:</p>
        <list list-type="order">
          <list-item>
            <p>We propose the first care-assessment-aware activity recognition approach by modeling the correlation between health conditions using CL and movement patterns to personalize and improve older adult activity recognition.</p>
          </list-item>
          <list-item>
            <p>We propose CSTT, integrating skeleton, depth heatmaps, and CL information. Our proposed CAM dynamically adjusts focus based on care needs, ensuring personalized recognition while enhancing robustness and efficiency.</p>
          </list-item>
          <list-item>
            <p>We present the first motion dataset with CL information, capturing real-world mealtime sessions to reflect aging and health impacts on mobility with high ecological validity.</p>
          </list-item>
          <list-item>
            <p>This work improves older adult activity recognition by integrating care assessment information and capturing motion variations influenced by these conditions for improved accuracy.</p>
          </list-item>
        </list>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>In this work, we tackled the challenge of older adult activity recognition, which is complicated by pose deformities and motion limitations in older adults. To address these issues, we proposed a heterogeneous spatiotemporal motion transformer with CAM, specifically designed for recognizing older adult activities. For model training, we collected a dataset with informed consent and ethical permission, <inline-graphic xlink:href="aging_v9i1e80102_fig26.png" xlink:type="simple" mimetype="image"/> where, <italic>X<sub>i</sub></italic> consists of body key points, depth images, and CL information, while <italic>y<sub>i</sub></italic> represents the corresponding activity label, and <italic>N</italic> denotes the number of training samples. The objective is to learn a function <italic>f</italic>: <italic>X</italic> → <italic>y</italic> that accurately classifies activities. In this section, we provide a detailed explanation of our data processing, followed by an in-depth discussion of the CSTT transformer. The overall architecture is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Care-assessment-aware spatiotemporal transformer architecture. This transformer enhances older adult activity recognition by integrating joint skeleton data, depth heatmaps, and CL information. The Spatial Transformer Block uses a care-aware attention mechanism, where the care level acts as a query to capture spatial relationships. The Temporal Transformer Block uses multihead self-attention to model temporal dependencies across frames. Finally, the Classification Layer processes the learned representations to predict activity labels using a fully connected network and softmax activation.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Collection and Validation Method</title>
        <sec>
          <title>Overview</title>
          <p>The dataset was gathered in partnership with Global Care, a care facility in Japan, dedicated to supporting patients with dementia, which is equipped with an in-house video-monitoring system. The camera used for the monitoring is the AXIS M3048-P [<xref ref-type="bibr" rid="ref74">74</xref>], a cost-effective fixed dome fish-eye camera featuring a 12-megapixel sensor. This camera is designed to provide a comprehensive 360° view of the surroundings, with distortion-corrected display options such as panoramic views, specific areas, corridors, corners, and quad displays, all offering exceptional sharpness. Additionally, the camera comes prefocused, eliminating the need for manual adjustments. To maintain the authenticity of the care environment, we chose not to incorporate any additional sensors. In collaboration with the care facilities, we accessed their video recordings, which specifically captured activities of 51 older adult people. For dataset 1, data were collected from 28 older adults during lunch mealtime from 3 different sites over a span of 15 days. Each recorded session lasted between 30 and 60 minutes. Older adult participants’ ages ranged from 64 to 95 (mean 79.5) years. The placement of each participant was predetermined and managed by the staff of the care facility. To maintain the facility’s natural workflow, our team provided no instructions or interventions. In site 2, as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, patient 7 was assigned multiple positions, as their location was occasionally adjusted by the staff during certain sessions. This variation in positioning was carefully considered during the data processing phase to ensure the accuracy of the labels. For dataset 2, the other 23 participants’ data were collected from the open gathering space of the dining and living area in site 4 across 2 different floors. Participants were aged between 71 and 92 (mean 81.5) years. 
The data were collected over a 15-day period through continuous 24-hour monitoring. The site layouts are given in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <p>Due to the limitations in posture and mobility, the activities of older adult individuals differ significantly from those of healthy individuals. Similar activities may present with different postures depending on CL, while different activities might appear with similar postures across different CL groups. As a result, CL may offer crucial insights for older adult activity recognition. To address this, we have collected each older adult person’s CL alongside the video data. The score was provided by the medical professionals based on assistance requirements for meals, baths, excretion, movement, and dress-up. These levels, ranging from 1 (minimal assistance) to 5 (maximum assistance), determine the extent of supervision and support required [<xref ref-type="bibr" rid="ref75">75</xref>]. <xref ref-type="table" rid="table1">Table 1</xref> shows the explanation regarding the CL. For easier understanding of the readers, we have categorized the CLs based on assistance requirements into low, mid, and high.</p>
          <p>From the recorded videos, skeleton data and depth images were extracted for analysis. For the purpose of this study, only skeleton data, depth images, and care-related information were used, while RGB images were excluded from the analysis. In <xref ref-type="table" rid="table2">Table 2</xref>, a baseline comparison is shown between the 2 collected datasets.</p>
          <p>Comparing the components between the 2 datasets in <xref ref-type="table" rid="table2">Table 2</xref>, we found that gender, care assistance requirement, and data collection time are significantly different. Age does not vary much because the target group of our research is adults aged 60 to 100 years.</p>
          <p>To highlight the necessity of incorporating care context with motion data for activity recognition, we used 2 similarity comparison approaches. The first involves calculating pairwise similarity among different CL groups, while the second focuses on comparing the pairwise similarity between low and medium CL groups for various activities. To compute similarity scores, we used mean per joint angle difference (MPJAD), cosine similarity (CS), and histogram of oriented gradients (HOG) similarity. Detailed explanations of the score calculation methods are given in the following sections:</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Model performance evaluation: the receiver operating characteristic curve illustrates care-assessment-aware spatiotemporal transformer’s prediction accuracy across different activities (class 0: sitting, class 1: eating, class 2: stand up, and class 3: trying to stand up). ROC: receiver operating characteristic.</p>
            </caption>
            <graphic xlink:href="aging_v9i1e80102_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Care-level interpretation for the older adult individuals.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="100"/>
              <col width="100"/>
              <col width="800"/>
              <thead>
                <tr valign="top">
                  <td>AR<sup>a</sup></td>
                  <td>CLs<sup>b</sup></td>
                  <td>Older adult condition</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Low</td>
                  <td>1 and 2</td>
                  <td>Older adult individuals in this group can move independently or with minimal aid. They sometimes require monitoring and reminders for meals, excretion, and baths. For dress-up, they sometimes need monitoring or partial assistance. They generally maintain an upright posture, with only minor stooping due to age-related spinal degeneration.</td>
                </tr>
                <tr valign="top">
                  <td>Mid</td>
                  <td>3</td>
                  <td>Older adult individuals at this stage require full support from caregivers for movement; however, they can stand up with the support of assistive devices. They require monitoring, reminders, and partial assistance for meals, excretion, and baths. For dress-up, they need full assistance. Posture is often characterized by forward leaning or hunching due to weakened core muscles and joint instability. Sitting may involve slumping as maintaining an upright position becomes difficult.</td>
                </tr>
                <tr valign="top">
                  <td>High</td>
                  <td>4 and 5</td>
                  <td>Older adult individuals at this stage require full support from caregivers for everything. They are predominantly immobile. Postural control is severely compromised. No standard posture can be seen.</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>AR: assistance requirement.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>CLs: care levels.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Baseline comparison between the datasets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="230"/>
              <col width="250"/>
              <col width="250"/>
              <col width="240"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Components</td>
                  <td>Dataset 1</td>
                  <td>Dataset 2</td>
                  <td><italic>P</italic> value</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="2">Age (years), mean (SD)</td>
                  <td>79.5 (8.95)</td>
                  <td>81.5 (6.06)</td>
                  <td>.35</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Gender</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Men</td>
                  <td>10</td>
                  <td>20</td>
                  <td>&#60;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Women</td>
                  <td>18</td>
                  <td>3</td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Care assistance requirement</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>High</td>
                  <td>6</td>
                  <td>1</td>
                  <td>&#60;.01</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Medium</td>
                  <td>8</td>
                  <td>18</td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Low</td>
                  <td>14</td>
                  <td>4</td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Data collection time (minutes)</td>
                  <td>750</td>
                  <td>21,600</td>
                  <td>&#60;.001</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Mean per Joint Angle Difference</title>
          <p>MPJAD measures the average angular difference between 2 motion sequences across all frames and joints. Given 2 motion sequences <italic>S<sub>i</sub></italic> and <italic>S<sub>j</sub></italic>, the MPJAD is computed as:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(1)</p>
          <p>Where <italic>P</italic> is the set of all motion sequence pairs, <italic>M</italic> is the number of motion sequence pairs, <italic>T<sub>ij</sub></italic> is the number of frames common between sequences <italic>S<sub>i</sub></italic> and <italic>S<sub>j</sub></italic>, <italic>N</italic> is the total number of joints (17 in this case), <inline-graphic xlink:href="aging_v9i1e80102_fig27.png" xlink:type="simple" mimetype="image"/> is the joint angle at joint <italic>k</italic> and frame <italic>t</italic> in sequence <italic>S<sub>i</sub></italic>, and <inline-graphic xlink:href="aging_v9i1e80102_fig28.png" xlink:type="simple" mimetype="image"/> is the absolute difference in joint angles. A lower MPJAD value indicates a higher similarity between motion sequences. The 2 motions follow similar joint movement patterns. A higher MPJAD value indicates greater dissimilarity between motion sequences. The 2 motions have significantly different joint movements.</p>
        </sec>
        <sec>
          <title>Cosine Similarity</title>
          <p>CS measures the angle between 2 vectors in a multidimensional space. Given 2 motion sequences <italic>S<sub>i</sub></italic> and <italic>S<sub>j</sub></italic>, the CS is computed by following the steps:</p>
          <p>Derive the cosine dissimilarity (used to measure homogeneity):</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(2)</p>
          <p>Where <italic>S<sub>i</sub></italic> and <italic>S<sub>j</sub></italic> are the flattened motion sequences represented as vectors. <italic>S<sub>i</sub></italic> · <italic>S<sub>j</sub></italic> is the dot product of the 2 vectors. &#124;<italic>S<sub>i</sub></italic>&#124; and &#124;<italic>S<sub>j</sub></italic>&#124; are the Euclidean norms (magnitudes). Compute the mean pairwise cosine dissimilarity within the class:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(3)</p>
          <p>Where <italic>N</italic> is the total number of unique pairs (<italic>i,j</italic>) within the class. Lower values of cosine dissimilarity indicate higher homogeneity (motion vectors within the class are closely aligned). Higher values of cosine dissimilarity indicate lower homogeneity (motion vectors within the class are more divergent).</p>
        </sec>
        <sec>
          <title>HOG Similarity</title>
          <p>HOG measures how structurally similar 2 images are based on their gradient information. Once the HOG feature vectors <italic>H<sub>1</sub></italic> and <italic>H<sub>2</sub></italic> are extracted for 2 images, their similarity is computed using the CS formula:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(4)</p>
          <p>Where <italic>H<sub>1</sub></italic> · <italic>H<sub>2</sub></italic> is the dot product of the 2 HOG feature vectors. &#124;<italic>H<sub>1</sub></italic>&#124; and &#124;<italic>H<sub>2</sub></italic>&#124; are the Euclidean norms of the vectors. If the HOG score is 1, it means the images have nearly identical edge and gradient structures, and if it is 0, it means the images have no similarity in gradient structure.</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This work involved human participants or animals in its research. Approval of all ethical and experimental procedures and protocols was granted by the research ethics board at Kyushu Institute of Technology (application no. 24-15). No participant compensation was provided. The data are not publicly available, but they would be shared upon collaboration request, only for research purposes, with applicants who adhere to the ethical and privacy policy of the ethics committee, after careful consideration.</p>
      </sec>
      <sec>
        <title>Skeleton Data Generation</title>
        <p>To ensure robustness, we used skeleton data in our study. To get the skeleton data we used YOLOv7 [<xref ref-type="bibr" rid="ref76">76</xref>], a state-of-the-art real-time object detection framework that builds upon the foundational YOLO (You Only Look Once) [<xref ref-type="bibr" rid="ref77">77</xref>] family, offering enhanced performance in terms of accuracy. It maintains high precision in both detection and key point localization, and the architecture can handle multiple scales and dense environments effectively. The input is an RGB image, <italic>I</italic><sub>RGB</sub>∈R<italic><sup>H</sup></italic><sup>×</sup><italic><sup>W</sup></italic><sup>×3</sup> where <italic>H</italic> is the height of the image, <italic>W</italic> is the width of the image, and 3 represents the number of color channels (ie, red, green, and blue). YOLOv7 identifies bounding boxes for human figures in the input RGB image. Each bounding box <italic>B<sub>k</sub></italic> is represented as <italic>B<sub>k</sub></italic> = (<italic>x</italic><sub>min</sub>,<italic>y</italic><sub>min</sub>,<italic>x</italic><sub>max</sub>,<italic>y</italic><sub>max</sub>,<italic>c<sub>k</sub></italic>), where (<italic>x</italic><sub>min</sub>,<italic>y</italic><sub>min</sub>) and (<italic>x</italic><sub>max</sub>,<italic>y</italic><sub>max</sub>) define the corners of the box and <italic>c<sub>k</sub></italic> is the confidence score. Within each bounding box, the model then predicts the locations and confidence scores for the 17 key points <italic>X</italic> = {<italic>X</italic><sub>1</sub>,<italic>X</italic><sub>2</sub>,….,<italic>X</italic><sub>17</sub>}, where each key point <italic>X<sub>i</sub></italic> is defined as <italic>X<sub>i</sub></italic> = (<italic>x<sub>i</sub></italic>,<italic>y<sub>i</sub></italic>,<italic>c<sub>i</sub></italic>), i∈{1,2,…,17}. 
Here, <italic>x<sub>i</sub></italic>,<italic>y<sub>i</sub></italic>: 2D coordinates of the <italic>i</italic>th key point in the image and <italic>c<sub>i</sub></italic>: confidence score indicating the likelihood that the <italic>i</italic>th key point is correctly detected (<italic>c<sub>i</sub></italic> ∈ [0,1]). So, the whole body can be represented as:</p>
        <disp-formula>
          <graphic xlink:href="aging_v9i1e80102_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>(5)</p>
        <p>where the total key point feature vector has 34 dimensions (17 key points × 2 coordinates).</p>
      </sec>
      <sec>
        <title>Depth Image Generation</title>
        <p>Skeleton data are highly robust but fail to capture the environmental context. In contrast, RGB data often include excessive information, which can lead to confusion due to varying lighting conditions. However, depth images offer valuable spatial insights, providing detailed information about the distance and positions of objects and body parts. Therefore, we generated depth maps from RGB images and incorporated them into our study. To achieve this, we used the “Depth Anything” model [<xref ref-type="bibr" rid="ref78">78</xref>], an advanced monocular depth estimation technique that transforms RGB images into depth maps. This model leverages deep learning (DL) to predict pixelwise depth values based on visual features in the image. Trained on extensive datasets of RGB-depth pairs, the model knows how to estimate depth using visual information alone. Each pixel in the image has 3 values corresponding to its red, green, and blue intensities. The goal of the model is to convert this 3-channel image into a depth map, where each pixel value corresponds to the distance from the camera to the object in the scene. Let <italic>f</italic><sub>θ</sub> represent the trained model that maps an RGB image <italic>I</italic><sub>RGB</sub> to its predicted depth map <italic>D</italic>. The model was learned from large datasets containing paired RGB images and their corresponding depth maps during training. The depth estimation function is given by <italic>D</italic> = <italic>f</italic><sub>θ</sub>(<italic>I</italic><sub>RGB</sub>). Then, each pixel of the depth map represents the estimated distance to the camera and can be denoted by</p>
        <disp-formula>
          <graphic xlink:href="aging_v9i1e80102_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>(6)</p>
        <p>where (<italic>x, y</italic>) are the coordinates of the pixel in the image. The value <italic>D</italic> (<italic>x, y</italic>) represents the depth of the corresponding point in the scene.</p>
      </sec>
      <sec>
        <title>Care-Assessment-Aware Spatiotemporal Transformer</title>
        <p>Our proposed CSTT analyzes body key points, heatmaps, and CL to predict human activities in a personalized manner. As a spatiotemporal heterogeneous transformer, it captures spatial features (key points and heatmaps) and temporal dynamics (motion sequences over time) while integrating multiple modalities with distinct representations—numerical key points, image-based heatmaps, and scalar CL—through CAM. In the Spatial Transformer, key points and heatmaps are processed to extract movement-related features, with the CL serving as an attention guide, determining which body movements and heatmap regions are most relevant. This enables the model to dynamically prioritize motion patterns based on an individual’s care needs. The Temporal Transformer then analyzes the sequence of spatial features, capturing motion dynamics over time. Learning temporal dependencies helps recognize activities and transitions between postures. Finally, the extracted spatiotemporal features are passed to a classifier, which predicts the activity category.</p>
        <sec>
          <title>Spatial Transformer</title>
          <p>The Spatial Transformer extracts meaningful representations from the key points and heatmap while considering the CL as a crucial guiding factor. The CL attends to the key points and heatmap, allowing the model to emphasize relevant body parts or movements based on the older adult person’s health condition.</p>
        </sec>
        <sec>
          <title>Feature Embedding</title>
          <sec>
            <title>Overview</title>
            <p>To transform the raw input data into a meaningful and structured format suitable for further processing, we used an embedding mechanism. This step is crucial as it converts different types of input data—body key points, heatmaps, and CL—into a shared latent representation that can be effectively used by the transformer model. By mapping these diverse inputs into a common latent space of dimension <italic>d</italic>, the embedding process ensures that the model can seamlessly integrate and compare information from multiple modalities, facilitating efficient learning and cross-modal interactions. There are 3 types of feature embedding done here as mentioned in the following sections.</p>
          </sec>
          <sec>
            <title>Key Point Embedding</title>
            <p>The 2D pose key points (17 joints, each with (x,y) coordinates) are flattened and passed through a linear layer:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(7)</p>
            <p>Where <italic>X</italic> ∈ R<sup>B×17×2</sup> is the input key point tensor and <italic>W<sub>k</sub></italic> ∈ R<sup>(17×2)×</sup><italic><sup>d</sup></italic> is the weight matrix, which learns spatial dependencies. <italic>K</italic> ∈ R<italic><sup>B</sup></italic><sup>×</sup><italic><sup>d</sup></italic> is the key point embedding.</p>
          </sec>
          <sec>
            <title>Heatmap Embedding</title>
            <p>The heatmap (grayscale image of size 64 × 64) is processed through a 3D convolution layer to capture local spatial dependencies and activation patterns:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(8)</p>
            <p>where, <italic>D</italic> ∈ R<italic><sup>B</sup></italic><sup>×1×64×64</sup> is the heatmap input. <italic>H</italic> ∈ R<italic><sup>B</sup></italic><sup>×</sup><italic><sup>T</sup></italic><sup>×</sup><italic><sup>d</sup></italic> is the heatmap embedding after convolution, where <italic>T</italic> represents the number of spatial patches.</p>
          </sec>
          <sec>
            <title>CL Embedding</title>
            <p>The CL (a scalar value) is passed through a linear layer:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(9)</p>
            <p>Where <italic>c</italic> ∈ R<italic><sup>B</sup></italic><sup>×1</sup> is the CL tensor and <italic>Wc</italic> ∈ R<sup>1×</sup><italic><sup>d</sup></italic> is the weight matrix. <italic>C</italic> ∈ R<italic><sup>B</sup></italic><sup>×</sup><italic><sup>d</sup></italic> is the CL embedding.</p>
          </sec>
        </sec>
        <sec>
          <title>Care-Aware Attention Mechanism</title>
          <sec>
            <title>Overview</title>
            <p>Unlike conventional transformer self-attention mechanisms that treat all inputs equally, our proposed CAM dynamically adjusts attention based on CL. Using CL embeddings as queries in a multihead attention mechanism prioritizes key body parts and movements. Key points and heatmaps form key value pairs, allowing the model to focus on relevant features. The attended features are refined through a feedforward network, enhancing learning. By explicitly integrating CL into feature learning, CAM improves interpretability and efficiency, making activity recognition more personalized and accurate. The entire process is depicted as follows:</p>
          </sec>
          <sec>
            <title>Attention Score Computation</title>
            <p>The traditional attention mechanism follows the scaled dot product attention formulation:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(10)</p>
            <p>Here, we made the modification in the embeddings of <italic>Q</italic>, <italic>K</italic>, and <italic>V</italic> values. CL embedding <italic>C</italic> ∈ R<italic><sup>B</sup></italic><sup>×1×</sup><italic><sup>d</sup></italic> is assigned to <italic>Q</italic> (Query). Concatenated Key point + Heatmap Features [<italic>K</italic>,<italic>H</italic>] ∈ R<italic><sup>B</sup></italic><sup>×(</sup><italic><sup>T</sup></italic><sup>+1)×</sup><italic><sup>d</sup></italic> is assigned to both <italic>K</italic> (Key) and <italic>V</italic> (Value). So the modified attention weights computation formula is:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig19.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(11)</p>
            <p>This results in a weighted sum of key points and heatmap features, emphasizing relevant information based on the CL.</p>
          </sec>
          <sec>
            <title>Multihead Attention</title>
            <p>To enhance model expressiveness, we use multihead attention, where different heads capture different aspects of the input:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig20.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(12)</p>
            <p>where each head performs attention independently, and the outputs are concatenated and projected using <italic>W<sub>o</sub></italic>.</p>
          </sec>
          <sec>
            <title>Feedforward Network</title>
            <p>After attention, the output is passed through a feedforward network:</p>
            <disp-formula>
              <graphic xlink:href="aging_v9i1e80102_fig21.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>(13)</p>
            <p>This enhances the feature representation before passing it to the temporal transformer.</p>
          </sec>
        </sec>
        <sec>
          <title>Temporal Transformer</title>
          <p>The temporal transformer is designed to capture and model the sequential dependencies that exist across frames. Since traditional transformers do not inherently account for the order of sequences, we incorporate positional encoding to inject information about the temporal order of the frames, allowing the model to distinguish between the different points in time:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig22.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(14)</p>
          <p>Where ρ is an empirically chosen constant and the value of ρ is 10,000. The Transformer Encoder then applies self-attention across the temporal dimension, processing a sequence of spatial features <italic>Z<sub>t</sub></italic> obtained from various timesteps. This process is mathematically represented as:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig23.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(15)</p>
          <p>Where <italic>Z<sub>t</sub></italic> ∈ R<italic><sup>T</sup></italic><sup>×</sup><italic><sup>d</sup></italic> is the temporal sequence of spatial representations and <inline-graphic xlink:href="aging_v9i1e80102_fig29.png" xlink:type="simple" mimetype="image"/> is the transformed sequence after passing through the encoder. Throughout this process, the model allows the frames to “attend” to one another, meaning each frame is evaluated in relation to others. This enables the model to capture the dependencies of motion, such as recognizing how a body part moves or changes over time, effectively tracking the progression of movement across frames. After the temporal sequence is processed, the features are aggregated using mean pooling to summarize the sequence of attended frames into a single fixed-size representation:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig24.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(16)</p>
          <p>This pooled feature vector encapsulates the entire movement sequence, providing a compact yet informative summary. Finally, this aggregated representation is passed into a classifier for further interpretation, enabling the model to make predictions based on the captured temporal dynamics.</p>
        </sec>
        <sec>
          <title>Classification</title>
          <p>The final feature vector encapsulates the entire activity, providing a contextual representation of the movement. After the pooled feature vector is obtained from the Temporal Transformer, it is passed through a softmax classifier for classification:</p>
          <disp-formula>
            <graphic xlink:href="aging_v9i1e80102_fig25.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>(17)</p>
          <p>In this equation, <italic>W</italic><sub>cls</sub> and <italic>b</italic><sub>cls</sub> are learnable parameters that the model optimizes during training. Specifically, <italic>W</italic><sub>cls</sub> is responsible for projecting the final feature vector into activity categories, effectively mapping the feature space to the set of possible classes. The softmax function then assigns probabilities to each of these activity categories, indicating the likelihood of each class being the correct one.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>In this work, we have proposed a care-assessment-aware approach for older adult activity recognition. This section presents the validation results for our dataset and evaluates the model’s performance. In addition, we have conducted an ablation study and provided a detailed discussion of the obtained results.</p>
      <sec>
        <title>Dataset Validation</title>
        <p>To assess the dataset and demonstrate the importance of integrating care context, we applied 2 similarity-based evaluation strategies. The first measures pairwise similarity across all care-level groups, shown in <xref ref-type="table" rid="table3">Table 3</xref>, and the second examines pairwise similarity specifically between the low- and medium-care groups for each activity, shown in <xref ref-type="table" rid="table4">Table 4</xref>. Similarity was quantified using MPJAD, CS, and HOG similarity.</p>
        <p>From <xref ref-type="table" rid="table3">Table 3</xref>, it is evident that the movement patterns of eating and sitting activities in the high care assistance group differ significantly from those in the other 2 groups. Since individuals in this category are almost immobile, activities such as trying to stand up and standing up are absent. For the low and medium care assistance groups, we conducted a pairwise similarity analysis of activities, as we observed that the motion patterns of trying to stand up and standing up share some similarities and yet exhibit notable differences. From <xref ref-type="table" rid="table4">Table 4</xref>, we can see that, despite being different activities, certain motion patterns show a high degree of similarity. In particular, the pairs eating–trying to stand up and standing up–trying to stand up demonstrate strong similarities, which is uncommon in standard activity recognition scenarios. These findings confirm that care context plays a crucial role in older adult activity motion patterns. To strengthen our claim, we also analyzed motion patterns using a comparative group of young adults. Since the study primarily focuses on older adult participants, these additional results are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for reference. All these comparison results validate our claim that care information plays a significant role in accurately distinguishing activities.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Pairwise similarity calculation among different care level groups.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <col width="0"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="3">Mean per joint angle difference</td>
                <td colspan="4">Cosine similarity</td>
                <td colspan="3">Histogram of oriented gradients similarity</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>L-H<sup>a</sup></td>
                <td>H-M<sup>b</sup></td>
                <td>M-L<sup>c</sup></td>
                <td>L-H</td>
                <td>H-M</td>
                <td>M-L</td>
                <td colspan="2">L-H</td>
                <td>H-M</td>
                <td>M-L</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>E<sup>d</sup></td>
                <td>0.95</td>
                <td>0.92</td>
                <td>0.18</td>
                <td>0.95</td>
                <td>0.92</td>
                <td>0.16</td>
                <td colspan="2">0.07</td>
                <td>0.14</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>S<sup>e</sup></td>
                <td>0.95</td>
                <td>0.92</td>
                <td>0.24</td>
                <td>0.95</td>
                <td>0.92</td>
                <td>0.27</td>
                <td colspan="2">0.07</td>
                <td>0.14</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>T<sup>f</sup></td>
                <td>N/A<sup>g</sup></td>
                <td>N/A</td>
                <td>0.49</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>0.51</td>
                <td colspan="2">N/A</td>
                <td>N/A</td>
                <td>0.25</td>
              </tr>
              <tr valign="top">
                <td>SU<sup>h</sup></td>
                <td>N/A</td>
                <td>N/A</td>
                <td>0.78</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>0.80</td>
                <td colspan="2">N/A</td>
                <td>N/A</td>
                <td>0.34</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>L-H: low and high care–level pair.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>H-M: high and medium care–level pair.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>M-L: medium and low care–level pair.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>E: eating.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>S: sitting.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>T: trying to stand up.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>SU: stand up.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Activity pairwise similarity calculation between low and medium care–level groups. The first activity is from the medium group.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="250"/>
            <col width="250"/>
            <col width="260"/>
            <thead>
              <tr valign="top">
                <td>Activity pair</td>
                <td>Mean per joint angle difference</td>
                <td>Cosine similarity</td>
                <td>Histogram of oriented gradients similarity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>E<sup>a</sup>-S<sup>b</sup></td>
                <td>0.91</td>
                <td>0.93</td>
                <td>0.14</td>
              </tr>
              <tr valign="top">
                <td>E-T<sup>c</sup></td>
                <td>0.21</td>
                <td>0.23</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>E-SU<sup>d</sup></td>
                <td>0.56</td>
                <td>0.60</td>
                <td>0.72</td>
              </tr>
              <tr valign="top">
                <td>S-E</td>
                <td>0.92</td>
                <td>0.95</td>
                <td>0.12</td>
              </tr>
              <tr valign="top">
                <td>S-T</td>
                <td>0.55</td>
                <td>0.59</td>
                <td>0.71</td>
              </tr>
              <tr valign="top">
                <td>S-SU</td>
                <td>0.54</td>
                <td>0.59</td>
                <td>0.72</td>
              </tr>
              <tr valign="top">
                <td>T-E</td>
                <td>0.22</td>
                <td>0.24</td>
                <td>0.90</td>
              </tr>
              <tr valign="top">
                <td>T-S</td>
                <td>0.56</td>
                <td>0.61</td>
                <td>0.72</td>
              </tr>
              <tr valign="top">
                <td>T-SU</td>
                <td>0.21</td>
                <td>0.23</td>
                <td>0.88</td>
              </tr>
              <tr valign="top">
                <td>SU-E</td>
                <td>0.56</td>
                <td>0.59</td>
                <td>0.73</td>
              </tr>
              <tr valign="top">
                <td>SU-S</td>
                <td>0.51</td>
                <td>0.53</td>
                <td>0.71</td>
              </tr>
              <tr valign="top">
                <td>SU-T</td>
                <td>0.25</td>
                <td>0.27</td>
                <td>0.89</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>E: eating.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>S: sitting.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>T: trying to stand up.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>SU: stand up.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Performance Evaluation</title>
        <p>To evaluate the model’s performance, we analyzed the attention weight matrix (AWM), the receiver operating characteristic curve, and the cumulative gain plot. The AWM in <xref rid="figure3" ref-type="fig">Figure 3</xref> validates that the CSTT model efficiently integrates CL, body key points, and depth heatmaps through CAM. The model dynamically adjusts the importance assigned to different input features, demonstrating its ability to capture hierarchical dependencies. Notably, CL serves as a guiding factor, influencing how attention is distributed across other features. The receiver operating characteristic curve in <xref rid="figure2" ref-type="fig">Figure 2</xref> evaluates the model’s ability to distinguish activity classes by plotting the true-positive rate versus the false-positive rate. The area under the curve (AUC) values range from 0.95 to 0.98, indicating excellent classification performance. The microaverage AUC (0.98) reflects strong overall accuracy, while the macroaverage AUC (0.97) shows balanced performance across classes. Classes 0, 1, and 2 have the highest AUC values, ensuring clear separation, while class 3 (0.95) shows slight overlap but still performs well. The cumulative gain plot in <xref rid="figure4" ref-type="fig">Figure 4</xref> highlights the model’s ability to rank correct predictions early. The steep rise in curves shows effective prioritization. This confirms that the CSTT efficiently ranks true positives early, which is critical for applications needing confident and rapid classification.</p>
        <p>We selected both machine learning and DL models as baselines to evaluate our approach. <xref ref-type="table" rid="table5">Table 5</xref> presents the results with and without CL information (except for our model, for which care context is essential) when the data are benchmarked using a cross-day approach with an 80-20 train-test split. Machine learning models used only skeleton data, resulting in comparatively lower performance. However, adding just care context significantly improved all models’ performance. Our model outperformed the traditional spatiotemporal transformer, achieving a 5% higher accuracy and a 9% increase in <italic>F</italic><sub>1</sub>-score.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Model performance evaluation: the cumulative gain plot (one vs all) illustrates care-assessment-aware spatiotemporal transformer’s prediction accuracy across different activities (class 0: sitting, class 1: eating, class 2: stand up, and class 3: trying to stand up). Here, red, orange, green, blue, and dotted lines represent class 3, class 1, class 2, class 0, and baseline accordingly.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Comparison with baseline methods.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="110"/>
            <col width="100"/>
            <col width="110"/>
            <col width="100"/>
            <col width="0"/>
            <col width="100"/>
            <col width="100"/>
            <col width="110"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td colspan="5">Without care (C) level</td>
                <td colspan="4">With care (C) level</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>P<sup>a</sup></td>
                <td>R<sup>b</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>A<sup>c</sup></td>
                <td colspan="2">P</td>
                <td>R</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>A</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Random Forest (S<sup>d</sup>)</td>
                <td>0.58</td>
                <td>0.65</td>
                <td>0.56</td>
                <td>0.65</td>
                <td colspan="2">0.69</td>
                <td>0.69</td>
                <td>0.69</td>
                <td>0.69</td>
              </tr>
              <tr valign="top">
                <td>XG Boost (S)</td>
                <td>0.67</td>
                <td>0.69</td>
                <td>0.66</td>
                <td>0.69</td>
                <td colspan="2">0.74</td>
                <td>0.74</td>
                <td>0.74</td>
                <td>0.74</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>e</sup> (S + D<sup>f</sup>)</td>
                <td>0.76</td>
                <td>0.75</td>
                <td>0.76</td>
                <td>0.77</td>
                <td colspan="2">0.82</td>
                <td>0.82</td>
                <td>0.81</td>
                <td>0.80</td>
              </tr>
              <tr valign="top">
                <td>ResNet (S + D)</td>
                <td>0.80</td>
                <td>0.74</td>
                <td>0.76</td>
                <td>0.86</td>
                <td colspan="2">0.85</td>
                <td>0.85</td>
                <td>0.85</td>
                <td>0.85</td>
              </tr>
              <tr valign="top">
                <td>TSTT<sup>g</sup> (S + D)</td>
                <td>0.82</td>
                <td>0.83</td>
                <td>0.82</td>
                <td>0.83</td>
                <td colspan="2">0.83</td>
                <td>0.91</td>
                <td>0.87</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>Our model</td>
                <td>N/A<sup>h</sup></td>
                <td>N/A</td>
                <td>N/A</td>
                <td>N/A</td>
                <td colspan="2">
                  <italic>0.97</italic>
                  <sup>i</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>i</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>i</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>i</sup>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>A: accuracy.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>S: skeleton data.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>D: depth image.</p>
            </fn>
            <fn id="table5fn7">
              <p><sup>g</sup>TSTT: traditional spatiotemporal transformer.</p>
            </fn>
            <fn id="table5fn8">
              <p><sup>h</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table5fn9">
              <p><sup>i</sup>Best performance results values are in italics to highlight the importance.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ablation Study</title>
        <p>In the ablation study, we removed components from the key and value pairs to analyze their impact. Also, we have randomly removed some key points to create partial skeleton data. We compared the traditional spatiotemporal transformer without the CAM layer, a care-assessment-aware transformer using only skeleton data (CSTT [S + C]), a care-assessment-aware transformer using only partial skeleton data (CSTT [Sp + C]), one using only depth images (CSTT [D + C]), our full model integrating partial skeleton data (CSTT [Sp + D + C]), and our full model integrating all (CSTT [S + D + C]). As shown in <xref ref-type="table" rid="table6">Table 6</xref>, CSTT (S + C) outperforms CSTT (D + C), aligning with the AWM analysis in <xref rid="figure3" ref-type="fig">Figure 3</xref>. Also, CSTT (Sp + C) performs poorly compared with others, but CSTT (Sp + D + C) performance is in the acceptable range. This shows that the model is comparatively robust.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Ablation study comparison.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>P<sup>a</sup></td>
                <td>R<sup>b</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>A<sup>c</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>TSTT<sup>d</sup> (S<sup>e</sup> + D<sup>f</sup>)</td>
                <td>0.82</td>
                <td>0.83</td>
                <td>0.82</td>
                <td>0.83</td>
              </tr>
              <tr valign="top">
                <td>TSTT (S + D + C<sup>g</sup>)</td>
                <td>0.83</td>
                <td>0.91</td>
                <td>0.87</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>CSTT<sup>h</sup> (S + C)</td>
                <td>0.94</td>
                <td>0.84</td>
                <td>0.88</td>
                <td>0.93</td>
              </tr>
              <tr valign="top">
                <td>CSTT (Sp<sup>i</sup> + C)</td>
                <td>0.81</td>
                <td>0.88</td>
                <td>0.84</td>
                <td>0.85</td>
              </tr>
              <tr valign="top">
                <td>CSTT (D + C)</td>
                <td>0.91</td>
                <td>0.82</td>
                <td>0.84</td>
                <td>0.90</td>
              </tr>
              <tr valign="top">
                <td>CSTT (Sp + D + C)</td>
                <td>0.92</td>
                <td>0.90</td>
                <td>0.91</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>Our model</td>
                <td>
                  <italic>0.97</italic>
                  <sup>j</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>j</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>j</sup>
                </td>
                <td>
                  <italic>0.96</italic>
                  <sup>j</sup>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>A: accuracy.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>TSTT: traditional spatiotemporal transformer.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>S: skeleton data.</p>
            </fn>
            <fn id="table6fn6">
              <p><sup>f</sup>D: depth image.</p>
            </fn>
            <fn id="table6fn7">
              <p><sup>g</sup>C: care context.</p>
            </fn>
            <fn id="table6fn8">
              <p><sup>h</sup>CSTT: care-assessment-aware spatiotemporal transformer.</p>
            </fn>
            <fn id="table6fn9">
              <p><sup>i</sup>Sp: partial skeleton data.</p>
            </fn>
            <fn id="table6fn10">
              <p><sup>j</sup>Best performance results values are in italics to highlight the importance.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Model performance evaluation: attention weight matrix demonstrates the effective incorporation of care level through care-aware attention mechanism in care-assessment-aware spatiotemporal transformer.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Deployment</title>
        <p>All experiments were implemented using the PyTorch DL framework and executed on an NVIDIA RTX 4080 GPU. The models were trained with a batch size of 8, an initial learning rate of 0.001, a weight decay of 0.0005, and for a total of 25 epochs. The trained model achieved an inference speed of 0.0043 seconds per sample, with approximately 1.2 M parameters and a computational cost of 1.09 giga floating-point operations (GFLOPs), indicating high computational efficiency.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study demonstrated that older adult activity recognition can be significantly improved by integrating care-level information into model design. The proposed CSTT effectively captured variations in motion patterns caused by differing health conditions and care requirements. Results revealed that even for the same activity, movements differed across CLs, while visually similar motions appeared in distinct activities (eg, eating vs trying to stand up). Despite class imbalance and naturally occurring variations, CSTT achieved high recognition performance. Misclassifications—such as confusion between eating and sitting—were primarily linked to overlapping movements during caregiver assistance. These findings validate that incorporating care assessment information enables more robust, context-aware, and personalized activity recognition, aligning closely with real-world monitoring needs in long-term older adult care environments.</p>
      </sec>
      <sec>
        <title>Care-Assessment-Aware Older Adult Activity Recognition</title>
        <p>Older adult activity recognition presents unique challenges due to the pose deformities, motion limitations, and variations in mobility caused by differing health conditions. Traditional HAR models often fail to capture these subtleties, as they are designed primarily for younger, healthy individuals and do not incorporate health-related variations. Furthermore, existing HAR approaches used for monitoring predominantly rely on RGB-based methods or, to a lesser extent, skeleton data, both of which struggle with computational efficiency and real-time applicability in older adult care environments. Our goal is to develop a context-aware HAR model that integrates CL assessments with motion data for personalized and accurate older adult activity recognition. To achieve this, we introduced CSTT, a heterogeneous spatiotemporal motion transformer incorporating skeleton data, depth-based heatmaps, and CL information, with CAM for improved adaptability. To ensure that the model captures real older adult activity patterns, we collected a motion dataset with CL annotations, recording uninterrupted mealtime sessions in a care facility for greater ecological validity. For model training, we focused on 4 key activities: eating, sitting, trying to stand up, and standing up. These were selected based on real nursing challenges—monitoring whether individuals are eating or sitting idle (requiring intervention if necessary) and ensuring continuous supervision during standing attempts to prevent falls. Unlike controlled datasets, our activity selection reflects real-world monitoring needs, capturing long-range dependencies in continuous time data, which aligns more closely with practical older adult care scenarios.</p>
      </sec>
      <sec>
        <title>Performance of Care-Assessment-Aware Older Adult Activity Recognition</title>
        <p>The results in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> support our initial hypothesis that older adult activity recognition differs from conventional methods, as their motion patterns are heavily influenced by their health conditions. Even for the same activity, movement can vary significantly across different CLs, while similar motion patterns may appear in completely different activities (eg, eating and trying to stand up). Our data collection was conducted without intervention to preserve real-world conditions. However, the dataset is highly imbalanced, particularly for the “trying to stand up” activity, which has significantly fewer samples than the other 3 classes. This reflects real-world scenarios but presents a challenge for recognition models. From <xref rid="figure5" ref-type="fig">Figure 5</xref>, the prediction accuracy for this class is 0.95, which is slightly lower than others. Among all classes, “eating” has the lowest accuracy at 0.93. Examining the confusion matrices provides insights into this misclassification. In <xref rid="figure6" ref-type="fig">Figure 6</xref>, eating is frequently confused with sitting, which is reasonable since caregivers often assist older adult individuals, reducing distinct motion patterns between these activities. In <xref rid="figure7" ref-type="fig">Figure 7</xref>, eating is often mistaken for trying to stand up, likely because older adult individuals extend their hands and lean forward for support when getting up—movements similar to hovering over a tray to pick up food. However, as seen in <xref rid="figure8" ref-type="fig">Figure 8</xref>, low care assistance individuals do not exhibit such overlapping movements, leading to perfect recognition of the eating activity. Based on overall performance, our model successfully outperforms the baselines.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Normalized confusion matrix for all groups together.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Normalized confusion matrix for high care assistance required group.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Normalized confusion matrix for medium care assistance required group.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Normalized confusion matrix for low care assistance required group.</p>
          </caption>
          <graphic xlink:href="aging_v9i1e80102_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Current Limitations and Future Work</title>
        <p>Although our dataset reflects real-world scenarios, it remains imbalanced, with activities such as “trying to stand up” having fewer samples. This affects classification accuracy (<xref rid="figure5" ref-type="fig">Figure 5</xref>), where “trying to stand up” achieves 0.95 and “eating” the lowest at 0.93 due to motion similarities, especially with caregiver assistance. We did not apply any data imbalance handling techniques, as “trying to stand up” naturally occurs less frequently in real-world settings. We did not apply data augmentation to address class imbalance because in older adult care monitoring, maintaining natural motion patterns and ecological validity is crucial. In cases of severe imbalance, augmentation risks producing unrealistic or biased samples, distorting class distributions, and leading to overfitting to synthetic data. The absence of CL annotations in existing datasets restricted direct comparisons with previous older adult HAR approaches. Although we manually annotated CLs for the ETRI Activity3D and Toyota Smarthome datasets to enable comparison, the potential inaccuracy of these inferred labels and the lack of representation of high-assistance older adult groups led us to include these results in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> rather than in the main manuscript. Future work will focus on annotating CLs in public datasets with medical professionals for comparative analysis. Due to ethical restrictions, our dataset cannot be publicly shared but may be accessed through collaboration. To support replication, we provide references and guidelines for collecting similar datasets with integrated CL information. Since our model depends on accurate CL assessments, misclassification can impact recognition, emphasizing the importance of expert collaboration in dataset development.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this work, we address the challenge of older adult activity recognition by considering the variability in movements influenced by CLs, which is often overlooked in existing datasets. To overcome this, we introduced a novel older adult motion dataset that includes CL information, collected from 51 older adult participants during real-world mealtime sessions in an ethical and privacy-preserving manner. We proposed CSTT, a spatiotemporal heterogeneous transformer model that integrates body key points, heatmaps, and CLs to predict activities in a personalized way by dynamically adjusting attention based on CLs. Our model surpassed the conventional spatiotemporal transformer, achieving a 5% higher accuracy and a 9% improvement in <italic>F</italic><sub>1</sub>-score, highlighting the significance of incorporating CL data. However, limitations include dataset imbalance and the inability to compare with similar works due to a lack of comparable datasets, as well as ethical restrictions on sharing the dataset. In conclusion, our work lays the foundation for more accurate, context-aware older adult activity recognition, with future research focusing on dataset expansion, model refinement, and real-world applications in care settings.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Validation of care-aware data collection and dataset reproducibility resources.</p>
        <media xlink:href="aging_v9i1e80102_app1.docx" xlink:title="DOCX File , 242 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Generalization of care-assessment-aware spatiotemporal transformer by comparing performance on other data sets.</p>
        <media xlink:href="aging_v9i1e80102_app2.docx" xlink:title="DOCX File , 18 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AWM</term>
          <def>
            <p>attention weight matrix</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CAM</term>
          <def>
            <p>care-aware attention mechanism</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CAS</term>
          <def>
            <p>care assessment score</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CL</term>
          <def>
            <p>care level</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CS</term>
          <def>
            <p>cosine similarity</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">CSTT</term>
          <def>
            <p>care-assessment-aware spatiotemporal transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">HAR</term>
          <def>
            <p>human activity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">HOG</term>
          <def>
            <p>histogram of oriented gradients</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">LTC</term>
          <def>
            <p>long-term care</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">MPJAD</term>
          <def>
            <p>mean per joint angle difference</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">YOLO</term>
          <def>
            <p>You Only Look Once</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was supported by the JST Challenging Research Program for Next Generation Researchers (grant JPMJSP2154). Grammarly AI and QuillBot AI have been used for grammatical corrections and paraphrasing assistance. The depth images used in this study were generated using the DepthAnything model. The details are provided in the “Methods” section (B. Depth Image Generation).</p>
    </ack>
    <notes>
      <sec>
        <title>Funding</title>
        <p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this article.</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data are not publicly available, but they would be shared, upon a collaboration request and after careful consideration, only for research purposes with applicants who adhere to the ethical and privacy policy of the ethics committee. Also, we have provided resources for reproducibility in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>World report on ageing and health</article-title>
          <source>World Health Organization</source>
          <year>2015</year>
          <access-date>2025-12-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/publications/i/item/9789241565042">https://www.who.int/publications/i/item/9789241565042</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McNicoll</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>World population ageing 1950-2050</source>
          <year>2002</year>
          <access-date>2025-12-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/rwe/10.1007/978-981-99-7842-7_154">https://link.springer.com/rwe/10.1007/978-981-99-7842-7_154</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gaudenz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Geest</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schwendimann</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zúñiga</surname>
              <given-names>Franziska</given-names>
            </name>
          </person-group>
          <article-title>Factors associated with care workers' intention to leave employment in nursing homes: a secondary data analysis of the Swiss nursing homes human resources project</article-title>
          <source>J Appl Gerontol</source>
          <year>2019</year>
          <month>11</month>
          <volume>38</volume>
          <issue>11</issue>
          <fpage>1537</fpage>
          <lpage>1563</lpage>
          <pub-id pub-id-type="doi">10.1177/0733464817721111</pub-id>
          <pub-id pub-id-type="medline">28715925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Leiter</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Maslach</surname>
              <given-names>Christina</given-names>
            </name>
          </person-group>
          <article-title>Nurse turnover: the mediating role of burnout</article-title>
          <source>J Nurs Manag</source>
          <year>2009</year>
          <month>04</month>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>331</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2834.2009.01004.x</pub-id>
          <pub-id pub-id-type="medline">19426369</pub-id>
          <pub-id pub-id-type="pii">JNM1004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Impact of resilience and job involvement on turnover intention of new graduate nurses using structural equation modeling</article-title>
          <source>Jpn J Nurs Sci</source>
          <year>2018</year>
          <month>10</month>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>351</fpage>
          <lpage>362</lpage>
          <pub-id pub-id-type="doi">10.1111/jjns.12210</pub-id>
          <pub-id pub-id-type="medline">29508523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Do type A personality and neuroticism moderate the relationships of occupational stressors, job satisfaction and burnout among Chinese older nurses? A cross-sectional survey</article-title>
          <source>BMC Nurs</source>
          <year>2022</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>88</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcnurs.biomedcentral.com/articles/10.1186/s12912-022-00865-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12912-022-00865-7</pub-id>
          <pub-id pub-id-type="medline">35428288</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12912-022-00865-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC9013170</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Siciliano</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Burrage</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Chronic pain in the elderly: a continuing education program for certified nursing assistants</article-title>
          <source>Geriatr Nurs</source>
          <year>2005</year>
          <volume>26</volume>
          <issue>4</issue>
          <fpage>252</fpage>
          <lpage>258</lpage>
          <pub-id pub-id-type="doi">10.1016/j.gerinurse.2005.05.008</pub-id>
          <pub-id pub-id-type="medline">16109299</pub-id>
          <pub-id pub-id-type="pii">S0197-4572(05)00132-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Perception, behavior and experience of nursing assistants towards pain of older adults with dementia: a qualitative study</article-title>
          <source>Geriatr Nurs</source>
          <year>2024</year>
          <volume>56</volume>
          <fpage>100</fpage>
          <lpage>107</lpage>
          <pub-id pub-id-type="doi">10.1016/j.gerinurse.2024.02.013</pub-id>
          <pub-id pub-id-type="medline">38340431</pub-id>
          <pub-id pub-id-type="pii">S0197-4572(24)00023-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holloway</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>McConigley</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Understanding nursing assistants' experiences of caring for older people in pain: the Australian experience</article-title>
          <source>Pain Manag Nurs</source>
          <year>2009</year>
          <month>06</month>
          <volume>10</volume>
          <issue>2</issue>
          <fpage>99</fpage>
          <lpage>106</lpage>
          <pub-id pub-id-type="doi">10.1016/j.pmn.2008.10.001</pub-id>
          <pub-id pub-id-type="medline">19481049</pub-id>
          <pub-id pub-id-type="pii">S1524-9042(08)00154-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dobbs</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Carrion</surname>
              <given-names>IV</given-names>
            </name>
            <name name-style="western">
              <surname>Vongxaiburana</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hyer</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Certified nursing assistants' perspectives of nursing home residents' pain experience: communication patterns, cultural context, and the role of empathy</article-title>
          <source>Pain Manag Nurs</source>
          <year>2014</year>
          <month>03</month>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>87</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1016/j.pmn.2012.06.008</pub-id>
          <pub-id pub-id-type="medline">24602428</pub-id>
          <pub-id pub-id-type="pii">S1524-9042(12)00087-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Halifax</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Miaskowski</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wallhagen</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Certified nursing assistants' understanding of nursing home residents' pain</article-title>
          <source>J Gerontol Nurs</source>
          <year>2018</year>
          <volume>44</volume>
          <issue>4</issue>
          <fpage>29</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.3928/00989134-20180131-01</pub-id>
          <pub-id pub-id-type="medline">29437185</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jansen</surname>
              <given-names>BDW</given-names>
            </name>
            <name name-style="western">
              <surname>Brazil</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Passmore</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Buchanan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Maxwell</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>McIlfatrick</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Morgan</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Watson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Parsons</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Exploring healthcare assistants' role and experience in pain assessment and management for people with advanced dementia towards the end of life: a qualitative study</article-title>
          <source>BMC Palliat Care</source>
          <year>2017</year>
          <volume>16</volume>
          <issue>1</issue>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpalliatcare.biomedcentral.com/articles/10.1186/s12904-017-0184-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12904-017-0184-1</pub-id>
          <pub-id pub-id-type="medline">28103847</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12904-017-0184-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC5247820</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lohne</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Høy</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lillestø</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sæteren</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Heggestad</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Aasgaard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Caspari</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rehnsfeldt</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Råholm</surname>
              <given-names>M-B</given-names>
            </name>
            <name name-style="western">
              <surname>Slettebø</surname>
              <given-names>Å</given-names>
            </name>
            <name name-style="western">
              <surname>Lindwall</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nåden</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Fostering dignity in the care of nursing home residents through slow caring</article-title>
          <source>Nurs Ethics</source>
          <year>2017</year>
          <month>11</month>
          <volume>24</volume>
          <issue>7</issue>
          <fpage>778</fpage>
          <lpage>788</lpage>
          <pub-id pub-id-type="doi">10.1177/0969733015627297</pub-id>
          <pub-id pub-id-type="medline">26850071</pub-id>
          <pub-id pub-id-type="pii">0969733015627297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moilanen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kangasniemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Papinaho</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mynttinen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Siipi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Suominen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Suhonen</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Older people's perceived autonomy in residential care: an integrative review</article-title>
          <source>Nurs Ethics</source>
          <year>2021</year>
          <volume>28</volume>
          <issue>3</issue>
          <fpage>414</fpage>
          <lpage>434</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/0969733020948115?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0969733020948115</pub-id>
          <pub-id pub-id-type="medline">33000683</pub-id>
          <pub-id pub-id-type="pmcid">PMC8151558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Preshaw</surname>
              <given-names>DH</given-names>
            </name>
            <name name-style="western">
              <surname>Brazil</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>McLaughlin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Frolic</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Ethical issues experienced by healthcare workers in nursing homes: literature review</article-title>
          <source>Nurs Ethics</source>
          <year>2016</year>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>490</fpage>
          <lpage>506</lpage>
          <pub-id pub-id-type="doi">10.1177/0969733015576357</pub-id>
          <pub-id pub-id-type="medline">25870176</pub-id>
          <pub-id pub-id-type="pii">0969733015576357</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goethals</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gastmans</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>de Casterlé</surname>
              <given-names>B D</given-names>
            </name>
          </person-group>
          <article-title>Nurses' ethical reasoning and behaviour: a literature review</article-title>
          <source>Int J Nurs Stud</source>
          <year>2010</year>
          <volume>47</volume>
          <issue>5</issue>
          <fpage>635</fpage>
          <lpage>650</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijnurstu.2009.12.010</pub-id>
          <pub-id pub-id-type="medline">20096413</pub-id>
          <pub-id pub-id-type="pii">S0020-7489(09)00402-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Woods</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Pratt</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Awareness in dementia: ethical and legal issues in relation to people with dementia</article-title>
          <source>Aging Ment Health</source>
          <year>2005</year>
          <volume>9</volume>
          <issue>5</issue>
          <fpage>423</fpage>
          <lpage>429</lpage>
          <pub-id pub-id-type="doi">10.1080/13607860500143125</pub-id>
          <pub-id pub-id-type="medline">16024401</pub-id>
          <pub-id pub-id-type="pii">J4286115H743K114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heggestad</surname>
              <given-names>AKT</given-names>
            </name>
            <name name-style="western">
              <surname>Nortvedt</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Slettebø</surname>
              <given-names>Å</given-names>
            </name>
          </person-group>
          <article-title>Dignity and care for people with dementia living in nursing homes</article-title>
          <source>Dementia (London)</source>
          <year>2015</year>
          <volume>14</volume>
          <issue>6</issue>
          <fpage>825</fpage>
          <lpage>841</lpage>
          <pub-id pub-id-type="doi">10.1177/1471301213512840</pub-id>
          <pub-id pub-id-type="medline">24381212</pub-id>
          <pub-id pub-id-type="pii">1471301213512840</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Antwi</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Bowblis</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>The impact of nurse turnover on quality of care and mortality in nursing homes: evidence from the great recession</article-title>
          <source>Am J Health Econ</source>
          <year>2018</year>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>131</fpage>
          <lpage>163</lpage>
          <pub-id pub-id-type="doi">10.1162/ajhe_a_00096</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castle</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Engberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Men</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Nursing home staff turnover: impact on nursing home compare quality measures</article-title>
          <source>Gerontologist</source>
          <year>2007</year>
          <volume>47</volume>
          <issue>5</issue>
          <fpage>650</fpage>
          <lpage>651</lpage>
          <pub-id pub-id-type="doi">10.1093/geront/47.5.650</pub-id>
          <pub-id pub-id-type="medline">17989407</pub-id>
          <pub-id pub-id-type="pii">47/5/650</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Substantial differences in turnover intention between direct care workers in Chinese hospitals and long-term care facilities</article-title>
          <source>J Am Med Dir Assoc</source>
          <year>2021</year>
          <volume>22</volume>
          <issue>3</issue>
          <fpage>696</fpage>
          <lpage>700</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jamda.2020.09.006</pub-id>
          <pub-id pub-id-type="medline">33097399</pub-id>
          <pub-id pub-id-type="pii">S1525-8610(20)30786-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krein</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Turnwald</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Maust</surname>
              <given-names>DT</given-names>
            </name>
          </person-group>
          <article-title>"Sometimes it's not about the money... it's the way you treat people...": a qualitative study of nursing home staff turnover</article-title>
          <source>J Am Med Dir Assoc</source>
          <year>2022</year>
          <volume>23</volume>
          <issue>7</issue>
          <fpage>1178</fpage>
          <lpage>1184</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1525-8610(21)01063-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jamda.2021.11.036</pub-id>
          <pub-id pub-id-type="medline">34990586</pub-id>
          <pub-id pub-id-type="pii">S1525-8610(21)01063-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Midje</surname>
              <given-names>HH</given-names>
            </name>
            <name name-style="western">
              <surname>Nyborg</surname>
              <given-names>VN</given-names>
            </name>
            <name name-style="western">
              <surname>Nordsteien</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Øvergård</surname>
              <given-names>K I</given-names>
            </name>
            <name name-style="western">
              <surname>Brembo</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Torp</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Antecedents and outcomes of work engagement among nursing staff in long-term care facilities—a systematic review</article-title>
          <source>J Adv Nurs</source>
          <year>2024</year>
          <volume>80</volume>
          <issue>1</issue>
          <fpage>42</fpage>
          <lpage>59</lpage>
          <pub-id pub-id-type="doi">10.1111/jan.15804</pub-id>
          <pub-id pub-id-type="medline">37519065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heiks</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sabine</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Long term care and skilled nursing facilities</article-title>
          <source>Dela J Public Health</source>
          <year>2022</year>
          <volume>8</volume>
          <issue>5</issue>
          <fpage>144</fpage>
          <lpage>149</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36751604"/>
          </comment>
          <pub-id pub-id-type="doi">10.32481/djph.2022.12.032</pub-id>
          <pub-id pub-id-type="medline">36751604</pub-id>
          <pub-id pub-id-type="pii">djph-85-032</pub-id>
          <pub-id pub-id-type="pmcid">PMC9894029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Connie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>MKO</given-names>
            </name>
            <name name-style="western">
              <surname>Saedon</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Model-Based Feature Extraction and Classification for Parkinson Disease Screening Using Gait Analysis: Development and Validation Study</article-title>
          <source>JMIR Aging</source>
          <year>2025</year>
          <volume>8</volume>
          <fpage>e65629</fpage>
          <pub-id pub-id-type="doi">10.2196/65629</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>van Schooten</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Machine Learning Approach for Frailty Detection in Long-Term Care Using Accelerometer-Measured Gait and Daily Physical Activity: Model Development and Validation Study</article-title>
          <source>JMIR Aging</source>
          <year>2025</year>
          <volume>8</volume>
          <fpage>e77140</fpage>
          <pub-id pub-id-type="doi">10.2196/77140</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Host–parasite: Graph LSTM-in-LSTM for group activity recognition</article-title>
          <source>IEEE Trans Neural Netw Learn Syst</source>
          <year>2021</year>
          <volume>32</volume>
          <issue>2</issue>
          <fpage>663</fpage>
          <lpage>674</lpage>
          <pub-id pub-id-type="doi">10.1109/tnnls.2020.2978942</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Collaborative spatiotemporal feature learning for video action recognition</article-title>
          <year>2019</year>
          <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>June 15-20, 2019</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2019.00806</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Hierarchical long short-term concurrent memory for human interaction recognition</article-title>
          <year>2021</year>
          <conf-name>IEEE Transactions on Pattern Analysis and Machine Intelligence</conf-name>
          <conf-date>March 1, 2021</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <fpage>1110</fpage>
          <lpage>1118</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2019.2942030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>View adaptive neural networks for high performance skeleton-based human action recognition</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2019</year>
          <volume>41</volume>
          <issue>8</issue>
          <fpage>1963</fpage>
          <lpage>1978</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2019.2896631</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Skeleton-based action recognition with shift graph convolutional network</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 13-19, 2020</conf-date>
          <conf-loc>Seattle, WA</conf-loc>
          <fpage>183</fpage>
          <lpage>192</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr42600.2020.00026</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Spatiotemporal co-attention recurrent neural networks for human-skeleton motion prediction</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2022</year>
          <volume>44</volume>
          <issue>6</issue>
          <fpage>3300</fpage>
          <lpage>3315</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2021.3050918</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Moulin</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>RGBD-HuDaAct: a color-depth video database for human daily activity recognition</article-title>
          <year>2011</year>
          <conf-name>IEEE International Conference on Computer Vision Workshops (ICCV Workshops)</conf-name>
          <conf-date>November 6-13, 2011</conf-date>
          <conf-loc>Barcelona</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1147</fpage>
          <lpage>1153</lpage>
          <pub-id pub-id-type="doi">10.1109/iccvw.2011.6130379</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oreifej</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>HON4D: histogram of oriented 4D normals for activity recognition from depth sequences</article-title>
          <year>2013</year>
          <conf-name>2013 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 23-28, 2013</conf-date>
          <conf-loc>Portland, OR, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>716</fpage>
          <lpage>723</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2013.98</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Mining actionlet ensemble for action recognition with depth cameras</article-title>
          <year>2012</year>
          <conf-name>2012 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 16-21, 2012</conf-date>
          <conf-loc>Providence, RI, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1290</fpage>
          <lpage>1297</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2012.6247813</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shahroudy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kot</surname>
              <given-names>AC</given-names>
            </name>
          </person-group>
          <article-title>NTU RGB+D 120: a large-scale benchmark for 3D human activity understanding</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2020</year>
          <volume>42</volume>
          <issue>10</issue>
          <fpage>2684</fpage>
          <lpage>2701</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2019.2916873</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jafari</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kehtarnavaz</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>UTD-MHAD: A multimodal dataset for human action recognition utilizing a depth camera and a wearable inertial sensor</article-title>
          <year>2015</year>
          <month>12</month>
          <day>10</day>
          <conf-name>2015 IEEE International Conference on Image Processing (ICIP)</conf-name>
          <conf-date>September 27-30, 2015</conf-date>
          <conf-loc>Quebec City, QC, Canada</conf-loc>
          <fpage>168</fpage>
          <lpage>172</lpage>
          <pub-id pub-id-type="doi">10.1109/icip.2015.7350781</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>GJ</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Concurrence-aware long short-term sub-memories for person-person action recognition</article-title>
          <year>2017</year>
          <month>08</month>
          <day>24</day>
          <conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</conf-name>
          <conf-date>July 21-26, 2017</conf-date>
          <conf-loc>Honolulu, HI, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1109/cvprw.2017.270</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Participation-contributed temporal dynamic model for group activity recognition</article-title>
          <year>2018</year>
          <month>10</month>
          <day>15</day>
          <conf-name>Proceedings of the 26th ACM International Conference on Multimedia</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>New York, NY, USA</conf-loc>
          <fpage>1292</fpage>
          <lpage>1300</lpage>
          <pub-id pub-id-type="doi">10.1145/3240508.3240572</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>HiGCIN: hierarchical graph-based cross inference network for group activity recognition</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2023</year>
          <volume>45</volume>
          <issue>6</issue>
          <fpage>6955</fpage>
          <lpage>6968</lpage>
          <pub-id pub-id-type="doi">10.1109/TPAMI.2020.3034233</pub-id>
          <pub-id pub-id-type="medline">33108281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Bi-Modal progressive mask attention for fine-grained recognition</article-title>
          <source>IEEE Trans Image Process</source>
          <year>2020</year>
          <volume>29</volume>
          <fpage>7006</fpage>
          <lpage>7018</lpage>
          <pub-id pub-id-type="doi">10.1109/tip.2020.2996736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Cross fusion for egocentric interactive action recognition</article-title>
          <year>2020</year>
          <conf-name>MultiMedia Modeling: 26th International Conference, MMM 2020, Proceedings, Part I</conf-name>
          <conf-date>January 5-8, 2020</conf-date>
          <conf-loc>Daejeon, South Korea</conf-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>714</fpage>
          <lpage>726</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-030-37731-1_58</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Simonyan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zisserman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Two-stream convolutional networks for action recognition in videos</article-title>
          <source>In Proceedings of the 28th International Conference on Neural Information Processing Systems</source>
          <year>2014</year>
          <volume>Volume 1</volume>
          <fpage>568</fpage>
          <lpage>576</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Crasto</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Weinzaepfel</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Alahari</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schmid</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>MARS: Motion-augmented RGB stream for action recognition</article-title>
          <year>2019</year>
          <month>01</month>
          <day>09</day>
          <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 15-20, 2019</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>7882</fpage>
          <lpage>7891</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2019.00807</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Diba</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fayyaz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Karami</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Arzani</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Yousefzadeh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Van Gool</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Temporal 3d convnets: new architecture and transfer learning for video classification</article-title>
          <source>arXiv. Preprint posted online on November 22, 2017</source>
          <year>2017</year>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S-F</given-names>
            </name>
            <name name-style="western">
              <surname>Paluri</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Convnet architecture search for spatiotemporal feature learning</article-title>
          <source>arXiv. Preprint posted online on August 16, 2017</source>
          <year>2017</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.1708.05038</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carreira</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zisserman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Quo vadis, action recognition? A new model and the kinetics dataset</article-title>
          <year>2017</year>
          <month>11</month>
          <day>09</day>
          <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>July 21-26, 2017</conf-date>
          <conf-loc>Honolulu, Hawaii, USA</conf-loc>
          <fpage>6299</fpage>
          <lpage>6308</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2017.502</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bourdev</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fergus</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Torresani</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Paluri</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Learning spatiotemporal features with 3D convolutional networks</article-title>
          <year>2015</year>
          <conf-name>IEEE International Conference on Computer Vision (ICCV)</conf-name>
          <conf-date>December 7-13, 2015</conf-date>
          <conf-loc>Santiago, Chile</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>4489</fpage>
          <lpage>4497</lpage>
          <pub-id pub-id-type="doi">10.1109/iccv.2015.510</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jegham</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Khalifa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alouani</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Mahjoub</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Vision-based human action recognition: an overview and real world challenges</article-title>
          <source>Forensic Sci Int Digit Invest</source>
          <year>2020</year>
          <volume>32</volume>
          <fpage>200901</fpage>
          <pub-id pub-id-type="doi">10.1016/j.fsidi.2019.200901</pub-id>
          <pub-id pub-id-type="medline">38667983</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Human action recognition and prediction: a survey</article-title>
          <source>Int J Comput Vis</source>
          <year>2022</year>
          <volume>130</volume>
          <issue>5</issue>
          <fpage>1366</fpage>
          <lpage>1401</lpage>
          <pub-id pub-id-type="doi">10.1007/s11263-022-01594-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition</article-title>
          <source>AAAI</source>
          <year>2018</year>
          <month>04</month>
          <day>27</day>
          <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>February 2-7, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <pub-id pub-id-type="doi">10.1609/aaai.v32i1.12328</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Two-stream adaptive graph convolutional networks for skeleton-based action recognition</article-title>
          <year>2020</year>
          <month>01</month>
          <day>09</day>
          <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 15-20, 2019</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <pub-id pub-id-type="doi">10.1109/cvpr.2019.01230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>McKay</surname>
              <given-names>RI</given-names>
            </name>
            <name name-style="western">
              <surname>Anwar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gedeon</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Fusing higher-order features in graph neural networks for skeleton-based action recognition</article-title>
          <source>IEEE Trans Neural Netw Learning Syst</source>
          <year>2024</year>
          <volume>35</volume>
          <issue>4</issue>
          <fpage>4783</fpage>
          <lpage>4797</lpage>
          <pub-id pub-id-type="doi">10.1109/tnnls.2022.3201518</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Actional-structural graph convolutional networks for skeleton-based action recognition</article-title>
          <year>2020</year>
          <month>01</month>
          <day>09</day>
          <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June, 15-20 2019</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>3595</fpage>
          <lpage>3603</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2019.00371</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Constructing stronger and faster baselines for skeleton-based action recognition</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2023</year>
          <volume>45</volume>
          <issue>2</issue>
          <fpage>1474</fpage>
          <lpage>1488</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2022.3157033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ke</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Towards To-a-T spatio-temporal focus for skeleton-based action recognition</article-title>
          <year>2022</year>
          <month>02</month>
          <day>04</day>
          <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>2022 April 29</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>1131</fpage>
          <lpage>1139</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v36i1.19998</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Si</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>An attention enhanced graph convolutional LSTM network for skeleton-based action recognition</article-title>
          <year>2020</year>
          <month>01</month>
          <day>09</day>
          <conf-name>The IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 15-20, 2019</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1227</fpage>
          <lpage>1236</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2019.00132</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A spatial attentive and temporal dilated (SATD) GCN for skeleton-based action recognition</article-title>
          <source>CAAI Trans on Intel Tech</source>
          <year>2021</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>46</fpage>
          <lpage>55</lpage>
          <pub-id pub-id-type="doi">10.1049/cit2.12012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Action recognition based on 3D skeleton and RGB frame fusion</article-title>
          <year>2020</year>
          <month>01</month>
          <day>28</day>
          <conf-name>2019 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>
          <conf-date>November 03-08, 2019</conf-date>
          <conf-loc>Macau, China</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>258</fpage>
          <lpage>264</lpage>
          <pub-id pub-id-type="doi">10.1109/iros40897.2019.8967570</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Das</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Koperski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bremond</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Francesca</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Action recognition based on a mixture of RGB and depth based skeleton</article-title>
          <year>2017</year>
          <month>10</month>
          <day>23</day>
          <conf-name>2017 14th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS)</conf-name>
          <conf-date>August 29, 2017 to September 01, 2017</conf-date>
          <conf-loc>Lecce, Italy</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/avss.2017.8078548</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Feature fusion based human action recognition in still images</article-title>
          <source>Intl J Comput Sci Netw Secur</source>
          <year>2019</year>
          <volume>19</volume>
          <issue>11</issue>
          <fpage>151</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Human complex activity recognition with sensor data using multiple features</article-title>
          <source>IEEE Sensors J</source>
          <year>2022</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>757</fpage>
          <lpage>775</lpage>
          <pub-id pub-id-type="doi">10.1109/jsen.2021.3130913</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Improving human action recognition by jointly exploiting video and WiFi clues</article-title>
          <source>Neurocomputing</source>
          <year>2021</year>
          <volume>458</volume>
          <fpage>14</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2020.11.074</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jalal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Vision-based human activity recognition system using depth silhouettes: a smart home system for monitoring the residents</article-title>
          <source>J Electr Eng Technol</source>
          <year>2019</year>
          <volume>14</volume>
          <issue>6</issue>
          <fpage>2567</fpage>
          <lpage>2573</lpage>
          <pub-id pub-id-type="doi">10.1007/s42835-019-00278-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nagpal</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Illés</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dey</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>goldenAGER: a personalized feature fusion activity recognition model for elderly</article-title>
          <source>IEEE Access</source>
          <year>2023</year>
          <volume>11</volume>
          <fpage>56766</fpage>
          <lpage>56784</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2023.3282439</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Girdhar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Carreira</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Doersch</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zisserman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Video action transformer network</article-title>
          <year>2020</year>
          <month>01</month>
          <day>09</day>
          <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 15-20, 2019</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>244</fpage>
          <lpage>253</lpage>
          <pub-id pub-id-type="doi">10.1109/CVPR.2019.00033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qing</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zuo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sang</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>OadTR: online action detection with transformers</article-title>
          <year>2021</year>
          <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>
          <conf-date>October 10-17, 2021</conf-date>
          <conf-loc>Montreal, QC, Canada</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>7565</fpage>
          <lpage>7575</lpage>
          <pub-id pub-id-type="doi">10.1109/iccv48922.2021.00747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mazzia</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Angarano</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Salvetti</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Angelini</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chiaberge</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Action transformer: a self-attention model for short-time pose-based human action recognition</article-title>
          <source>Pattern Recognit</source>
          <year>2022</year>
          <month>04</month>
          <volume>124</volume>
          <fpage>108487</fpage>
          <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108487</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Plizzari</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cannici</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Matteucci</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Skeleton-based action recognition via spatial and temporal transformer networks</article-title>
          <source>Comput Vision Image Understanding</source>
          <year>2021</year>
          <month>07</month>
          <volume>208-209</volume>
          <fpage>103219</fpage>
          <pub-id pub-id-type="doi">10.1016/j.cviu.2021.103219</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>EGOFALLS: a visual-audio dataset and benchmark for fall detection using egocentric cameras</article-title>
          <year>2025</year>
          <conf-name>Proceedings of the 27th International Conference on Pattern Recognition</conf-name>
          <conf-date>December 1, 2024 to December 5, 2024</conf-date>
          <conf-loc>Kolkata, India</conf-loc>
          <publisher-loc>Switzerland</publisher-loc>
          <publisher-name>Springer Nature</publisher-name>
          <fpage>240</fpage>
          <lpage>253</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-031-78166-7_16</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Minciullo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Garattoni</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bremond</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Francesca</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Toyota Smarthome Untrimmed: Real-World Untrimmed Videos for Activity Detection</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2023</year>
          <volume>45</volume>
          <issue>2</issue>
          <fpage>2533</fpage>
          <lpage>2550</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2022.3169976</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miron</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sadawi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ismail</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Grosan</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>IntelliRehabDS (IRDS)—a dataset of physical rehabilitation movements</article-title>
          <source>Data</source>
          <year>2021</year>
          <volume>6</volume>
          <issue>5</issue>
          <fpage>46</fpage>
          <pub-id pub-id-type="doi">10.3390/data6050046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>ETRI-Activity3D: a large-scale RGB-D dataset for robots to recognize daily activities of the elderly</article-title>
          <year>2021</year>
          <month>02</month>
          <day>10</day>
          <conf-name>2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>
          <conf-date>October 24, 2020 to January 24, 2021</conf-date>
          <conf-loc>Las Vegas, NV, USA</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>10990</fpage>
          <lpage>10997</lpage>
          <pub-id pub-id-type="doi">10.1109/iros45743.2020.9341160</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="web">
          <source>AXIS Communications</source>
          <access-date>2025-12-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.axis.com/dam/public/7d/1f/55/datasheet-axis-m3048-p-network-camera-ja-JP-287941.pdf">https://www.axis.com/dam/public/7d/1f/55/datasheet-axis-m3048-p-network-camera-ja-JP-287941.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matsuda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Muramatsu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hayashida</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Eligibility Classification Logic of the Japanese Long Term Care Insurance</article-title>
          <source>Asian Pacific Journal of Disease Management</source>
          <year>2011</year>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>65</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.7223/apjdm.5.65</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <source>YOLOv7: Trainable Bag-of-Freebies Sets New State-of-the-Art for Real-Time Object Detectors</source>
          <year>2023</year>
          <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>22 Aug 2023</conf-date>
          <conf-loc>Vancouver, BC, Canada</conf-loc>
          <fpage>7464</fpage>
          <lpage>7475</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr52729.2023.00721</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Redmon</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>You only look once: Unified, real-time object detection</article-title>
          <year>2016</year>
          <conf-name>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>12 Dec 2016</conf-date>
          <conf-loc>Las Vegas, NV, USA</conf-loc>
          <fpage>779</fpage>
          <lpage>788</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2016.91</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref78">
        <label>78</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <source>Unleashing the Power of Large-Scale Unlabeled Data</source>
          <year>2024</year>
          <publisher-loc>Geneva</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>2024</fpage>
          <lpage>21</lpage>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
