% West et al. 2025, JMIR Aging 8:e65178 -- unsupervised clustering of EHR embeddings to find ADRD subtypes.
% The article has no page range; "pages" holds the publisher's article number. The DOI resolver link is
% derived from the "doi" field, so only the publisher landing page is stored in "url".
@Article{info:doi/10.2196/65178,
  author   = {West, Matthew and Cheng, You and He, Yingnan and Leng, Yu and Magdamo, Colin and Hyman, Bradley T. and Dickson, John R. and Serrano-Pozo, Alberto and Blacker, Deborah and Das, Sudeshna},
  title    = {Unsupervised Deep Learning of Electronic Health Records to Characterize Heterogeneity Across {Alzheimer} Disease and Related Dementias: Cross-Sectional Study},
  journal  = {JMIR Aging},
  year     = {2025},
  month    = mar,
  day      = {31},
  volume   = {8},
  pages    = {e65178},
  keywords = {Alzheimer disease and related dementias; electronic health records; large language models; clustering; unsupervised learning},
  abstract = {Background: Alzheimer disease and related dementias (ADRD) exhibit prominent heterogeneity. Identifying clinically meaningful ADRD subtypes is essential for tailoring treatments to specific patient phenotypes.
              Objective: We aimed to use unsupervised learning techniques on electronic health records (EHRs) from memory clinic patients to identify ADRD subtypes.
              Methods: We used pretrained embeddings of non-ADRD diagnosis codes (International Classification of Diseases, Ninth Revision) and large language model (LLM)--derived embeddings of clinical notes from patient EHRs. Hierarchical clustering of these embeddings was used to identify ADRD subtypes. Clusters were characterized regarding their demographic and clinical features.
              Results: We analyzed a cohort of 3454 patients with ADRD from a memory clinic at Massachusetts General Hospital, each with a specialist diagnosis. Clustering pretrained embeddings of the non-ADRD diagnosis codes in patient EHRs revealed the following 3 patient subtypes: one with skin conditions, another with psychiatric disorders and an earlier age of onset, and a third with diabetes complications. Similarly, using LLM-derived embeddings of clinical notes, we identified 3 subtypes of patients as follows: one with psychiatric manifestations and higher prevalence of female participants (prevalence ratio: 1.59), another with cardiovascular and motor problems and higher prevalence of male participants (prevalence ratio: 1.75), and a third one with geriatric health disorders. Notably, we observed significant overlap between clusters from both data modalities ($\chi^2_4=89.4$; $P<.001$).
              Conclusions: By integrating International Classification of Diseases, Ninth Revision codes and LLM-derived embeddings, our analysis delineated 2 distinct ADRD subtypes with sex-specific comorbid and clinical presentations, offering insights for potential precision medicine approaches.},
  issn     = {2561-7605},
  doi      = {10.2196/65178},
  url      = {https://aging.jmir.org/2025/1/e65178},
}