Revalidation workflows • nfportalutils

This vignette demonstrates variants of metadata revalidation workflows.

First set up as usual.


library(nfportalutils)
syn_login(Sys.getenv("SYNAPSE_AUTH_TOKEN"))

Basics with Schematic API service

The Schematic API currently only works with dataset folders, so first find a dataset folder.

To validate metadata, a manifest must be reconstituted. Type ?manifest_generate to read the docs.

As the parameters below show, you need to know the data_type to validate against. The data_type is the same as the “Component” in the schematic data model (so the exact term depends on the data model). If you are unsure which one applies, try infer_data_type.


my_dataset <- "syn25386362"

inferred <- infer_data_type(my_dataset)
inferred
#> $result
#> [1] "GenomicsAssayTemplate"
#> 
#> $notes
#> [1] ""

data_type <- inferred$result

manifest_generate(data_type, 
                  dataset_id = my_dataset,
                  schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
                  output_format = "google_sheet") # otherwise excel 
#> Manifest generated as Googlesheet(s)
#> [[1]]
#> [1] "https://docs.google.com/spreadsheets/d/11ymlnESzn7XhHS3vHzlRFsDPhafJJehlhuCL9HgICt8"

Open the generated Google Sheet and download it as a .csv file. If Excel was chosen instead, open the file in a spreadsheet editor and re-save it as .csv. Then validate.

manifest_validate(data_type = data_type, 
                  file_name = "GenomicsAssayTemplate - Sheet1.csv")
#> $errors
#> $errors[[1]]
#> $errors[[1]][[1]]
#> [1] "2"
#> 
#> $errors[[1]][[2]]
#> [1] "assay"
#> 
#> $errors[[1]][[3]]
#> [1] "'' is not one of ['NIH Toolbox', 'STR profile', 'traction force microscopy', 'massively parallel reporter assay', 'gait measurement', 'conventional MRI', 'functional MRI', 'immunoassay', 'contextual conditioning behavior assay', 'genotyping', 'DNA optical mapping', 'NOMe-seq', 'Social Responsiveness Scale', 'targeted exome sequencing', '2D AlamarBlue fluorescence', 'TMT quantitation', 'liquid chromatography-electrochemical detection', 'whole genome sequencing', 'Riccardi and Ablon scales', 'cell"
#> 
#> $errors[[1]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[2]]
#> $errors[[2]][[1]]
#> [1] "2"
#> 
#> $errors[[2]][[2]]
#> [1] "specimenID"
#> 
#> $errors[[2]][[3]]
#> [1] "'' is too short"
#> 
#> $errors[[2]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[3]]
#> $errors[[3]][[1]]
#> [1] "2"
#> 
#> $errors[[3]][[2]]
#> [1] "libraryStrand"
#> 
#> $errors[[3]][[3]]
#> [1] "'' is not one of ['FirstStranded', 'Unstranded', 'Not Applicable', 'SecondStranded']"
#> 
#> $errors[[3]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[4]]
#> $errors[[4]][[1]]
#> [1] "2"
#> 
#> $errors[[4]][[2]]
#> [1] "tumorType"
#> 
#> $errors[[4]][[3]]
#> [1] "'' is not one of ['Anaplastic Ganglioglioma', 'Anaplastic Astrocytoma', 'Nodular Neurofibroma', 'Meningioma', 'Fibrosarcoma', 'Localized Neurofibroma', 'Glioblastoma', 'Malignant Peripheral Nerve Sheath Tumor', 'Anaplastic Pleomorphic Xanthoastrocytoma', 'Atypical Neurofibroma', 'tumor', 'Colorectal Adenocarcinoma', 'Recurrent MPNST', 'Pilocytic Astrocytoma', 'Ganglioglioma', 'Optic Pathway Glioma', 'Neurofibroma', 'Necrotic Neoplasm', 'Glioma', 'Teratoma', 'Cutaneous Neurofibroma', 'Fibromatosi"
#> 
#> $errors[[4]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[5]]
#> $errors[[5]][[1]]
#> [1] "2"
#> 
#> $errors[[5]][[2]]
#> [1] "libraryPreparationMethod"
#> 
#> $errors[[5]][[3]]
#> [1] "'' is not one of ['CEL-seq', 'NEBNext mRNA Library Prep Reagent Set for Illumina', '10x', 'GTAC@WUSTL in-house prep', 'KAPA mRNA HyperPrep Kit', 'TruSeq', 'unknown', 'KAPA HyperPrep Kit PCR-free', 'Illumina TruSeq DNA Nano', 'TruSeq standard total RNA library kit', 'QuantSeq FWD V2 with UDI', 'Drop-Seq', 'KAPA RNA HyperPrep Kit with RiboErase (HMR)', 'Smart-seq4', 'IDT xGen Exome Research Panel', 'Smart-seq2', 'Omni-ATAC']"
#> 
#> $errors[[5]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[6]]
#> $errors[[6]][[1]]
#> [1] "2"
#> 
#> $errors[[6]][[2]]
#> [1] "individualID"
#> 
#> $errors[[6]][[3]]
#> [1] "'' is too short"
#> 
#> $errors[[6]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[7]]
#> $errors[[7]][[1]]
#> [1] "2"
#> 
#> $errors[[7]][[2]]
#> [1] "platform"
#> 
#> $errors[[7]][[3]]
#> [1] "'' is not one of ['Illumina Genome Analyzer IIx', 'Illumina HiSeq X', 'Perlegen 300Karray', 'Vevo 3100 Imaging System', 'Illumina MouseWG-6 v2.0 expression beadchip', 'Illumina Infinium MethylationEPIC BeadChip v2.0 (935k)', 'Vectra H1 3D Imaging System', 'Nanostring Counter', 'Illumina Infinium MethylationEPIC BeadChip v1.0 (850k)', 'Illumina HumanOmniExpress-24 v1.0 BeadChip', 'Illumina HumanOmni1-Quadv1.0', 'LifeViz Micro System', 'LI-COR Odyssey CLx', 'Illumina HumanMethylation450', 'Illumin"
#> 
#> $errors[[7]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[8]]
#> $errors[[8]][[1]]
#> [1] "2"
#> 
#> $errors[[8]][[2]]
#> [1] "specimenPreparationMethod"
#> 
#> $errors[[8]][[3]]
#> [1] "'' is not one of ['FFPE', 'OCT', 'RNAlater', 'Viably frozen', 'Fresh collected', 'Cryopreserved', 'formalin-fixed', 'Flash frozen', 'ethanol']"
#> 
#> $errors[[8]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[9]]
#> $errors[[9]][[1]]
#> [1] "2"
#> 
#> $errors[[9]][[2]]
#> [1] "species"
#> 
#> $errors[[9]][[3]]
#> [1] "'' is not one of ['Rattus norvegicus', 'Gallus gallus', 'Danio rerio', 'Sus scrofa', 'Drosophila melanogaster', 'Oryctolagus cuniculus', 'Pan troglodytes', 'Rhesus macaque', 'Mus musculus (humanized)', 'Homo sapiens', 'Mus musculus']"
#> 
#> $errors[[9]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[10]]
#> $errors[[10]][[1]]
#> [1] "3"
#> 
#> $errors[[10]][[2]]
#> [1] "assay"
#> 
#> $errors[[10]][[3]]
#> [1] "'' is not one of ['NIH Toolbox', 'STR profile', 'traction force microscopy', 'massively parallel reporter assay', 'gait measurement', 'conventional MRI', 'functional MRI', 'immunoassay', 'contextual conditioning behavior assay', 'genotyping', 'DNA optical mapping', 'NOMe-seq', 'Social Responsiveness Scale', 'targeted exome sequencing', '2D AlamarBlue fluorescence', 'TMT quantitation', 'liquid chromatography-electrochemical detection', 'whole genome sequencing', 'Riccardi and Ablon scales', 'cell"
#> 
#> $errors[[10]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[11]]
#> $errors[[11]][[1]]
#> [1] "3"
#> 
#> $errors[[11]][[2]]
#> [1] "specimenID"
#> 
#> $errors[[11]][[3]]
#> [1] "'' is too short"
#> 
#> $errors[[11]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[12]]
#> $errors[[12]][[1]]
#> [1] "3"
#> 
#> $errors[[12]][[2]]
#> [1] "libraryStrand"
#> 
#> $errors[[12]][[3]]
#> [1] "'' is not one of ['FirstStranded', 'Unstranded', 'Not Applicable', 'SecondStranded']"
#> 
#> $errors[[12]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[13]]
#> $errors[[13]][[1]]
#> [1] "3"
#> 
#> $errors[[13]][[2]]
#> [1] "tumorType"
#> 
#> $errors[[13]][[3]]
#> [1] "'' is not one of ['Anaplastic Ganglioglioma', 'Anaplastic Astrocytoma', 'Nodular Neurofibroma', 'Meningioma', 'Fibrosarcoma', 'Localized Neurofibroma', 'Glioblastoma', 'Malignant Peripheral Nerve Sheath Tumor', 'Anaplastic Pleomorphic Xanthoastrocytoma', 'Atypical Neurofibroma', 'tumor', 'Colorectal Adenocarcinoma', 'Recurrent MPNST', 'Pilocytic Astrocytoma', 'Ganglioglioma', 'Optic Pathway Glioma', 'Neurofibroma', 'Necrotic Neoplasm', 'Glioma', 'Teratoma', 'Cutaneous Neurofibroma', 'Fibromatosi"
#> 
#> $errors[[13]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[14]]
#> $errors[[14]][[1]]
#> [1] "3"
#> 
#> $errors[[14]][[2]]
#> [1] "libraryPreparationMethod"
#> 
#> $errors[[14]][[3]]
#> [1] "'' is not one of ['CEL-seq', 'NEBNext mRNA Library Prep Reagent Set for Illumina', '10x', 'GTAC@WUSTL in-house prep', 'KAPA mRNA HyperPrep Kit', 'TruSeq', 'unknown', 'KAPA HyperPrep Kit PCR-free', 'Illumina TruSeq DNA Nano', 'TruSeq standard total RNA library kit', 'QuantSeq FWD V2 with UDI', 'Drop-Seq', 'KAPA RNA HyperPrep Kit with RiboErase (HMR)', 'Smart-seq4', 'IDT xGen Exome Research Panel', 'Smart-seq2', 'Omni-ATAC']"
#> 
#> $errors[[14]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[15]]
#> $errors[[15]][[1]]
#> [1] "3"
#> 
#> $errors[[15]][[2]]
#> [1] "individualID"
#> 
#> $errors[[15]][[3]]
#> [1] "'' is too short"
#> 
#> $errors[[15]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[16]]
#> $errors[[16]][[1]]
#> [1] "3"
#> 
#> $errors[[16]][[2]]
#> [1] "platform"
#> 
#> $errors[[16]][[3]]
#> [1] "'' is not one of ['Illumina Genome Analyzer IIx', 'Illumina HiSeq X', 'Perlegen 300Karray', 'Vevo 3100 Imaging System', 'Illumina MouseWG-6 v2.0 expression beadchip', 'Illumina Infinium MethylationEPIC BeadChip v2.0 (935k)', 'Vectra H1 3D Imaging System', 'Nanostring Counter', 'Illumina Infinium MethylationEPIC BeadChip v1.0 (850k)', 'Illumina HumanOmniExpress-24 v1.0 BeadChip', 'Illumina HumanOmni1-Quadv1.0', 'LifeViz Micro System', 'LI-COR Odyssey CLx', 'Illumina HumanMethylation450', 'Illumin"
#> 
#> $errors[[16]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[17]]
#> $errors[[17]][[1]]
#> [1] "3"
#> 
#> $errors[[17]][[2]]
#> [1] "specimenPreparationMethod"
#> 
#> $errors[[17]][[3]]
#> [1] "'' is not one of ['FFPE', 'OCT', 'RNAlater', 'Viably frozen', 'Fresh collected', 'Cryopreserved', 'formalin-fixed', 'Flash frozen', 'ethanol']"
#> 
#> $errors[[17]][[4]]
#> [1] ""
#> 
#> 
#> $errors[[18]]
#> $errors[[18]][[1]]
#> [1] "3"
#> 
#> $errors[[18]][[2]]
#> [1] "species"
#> 
#> $errors[[18]][[3]]
#> [1] "'' is not one of ['Rattus norvegicus', 'Gallus gallus', 'Danio rerio', 'Sus scrofa', 'Drosophila melanogaster', 'Oryctolagus cuniculus', 'Pan troglodytes', 'Rhesus macaque', 'Mus musculus (humanized)', 'Homo sapiens', 'Mus musculus']"
#> 
#> $errors[[18]][[4]]
#> [1] ""
#> 
#> 
#> 
#> $warnings
#> $warnings[[1]]
#> $warnings[[1]][[1]]
#> [1] "2"
#> 
#> $warnings[[1]][[2]]
#> [1] "age"
#> 
#> $warnings[[1]][[3]]
#> [1] "On row 2 the attribute age does not contain the proper value type num."
#> 
#> $warnings[[1]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[2]]
#> $warnings[[2]][[1]]
#> [1] "3"
#> 
#> $warnings[[2]][[2]]
#> [1] "age"
#> 
#> $warnings[[2]][[3]]
#> [1] "On row 3 the attribute age does not contain the proper value type num."
#> 
#> $warnings[[2]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[3]]
#> $warnings[[3]][[1]]
#> $warnings[[3]][[1]][[1]]
#> [1] "2"
#> 
#> $warnings[[3]][[1]][[2]]
#> [1] "3"
#> 
#> 
#> $warnings[[3]][[2]]
#> [1] "readPair"
#> 
#> $warnings[[3]][[3]]
#> [1] "readPair values in rows ['2', '3'] are out of the specified range."
#> 
#> $warnings[[3]][[4]]
#> $warnings[[3]][[4]][[1]]
#> [1] ""
#> 
#> 
#> 
#> $warnings[[4]]
#> $warnings[[4]][[1]]
#> [1] "2"
#> 
#> $warnings[[4]][[2]]
#> [1] "readLength"
#> 
#> $warnings[[4]][[3]]
#> [1] "On row 2 the attribute readLength does not contain the proper value type int."
#> 
#> $warnings[[4]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[5]]
#> $warnings[[5]][[1]]
#> [1] "3"
#> 
#> $warnings[[5]][[2]]
#> [1] "readLength"
#> 
#> $warnings[[5]][[3]]
#> [1] "On row 3 the attribute readLength does not contain the proper value type int."
#> 
#> $warnings[[5]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[6]]
#> $warnings[[6]][[1]]
#> [1] "2"
#> 
#> $warnings[[6]][[2]]
#> [1] "readDepth"
#> 
#> $warnings[[6]][[3]]
#> [1] "On row 2 the attribute readDepth does not contain the proper value type int."
#> 
#> $warnings[[6]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[7]]
#> $warnings[[7]][[1]]
#> [1] "3"
#> 
#> $warnings[[7]][[2]]
#> [1] "readDepth"
#> 
#> $warnings[[7]][[3]]
#> [1] "On row 3 the attribute readDepth does not contain the proper value type int."
#> 
#> $warnings[[7]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[8]]
#> $warnings[[8]][[1]]
#> [1] "2"
#> 
#> $warnings[[8]][[2]]
#> [1] "experimentalTimepoint"
#> 
#> $warnings[[8]][[3]]
#> [1] "On row 2 the attribute experimentalTimepoint does not contain the proper value type num."
#> 
#> $warnings[[8]][[4]]
#> [1] ""
#> 
#> 
#> $warnings[[9]]
#> $warnings[[9]][[1]]
#> [1] "3"
#> 
#> $warnings[[9]][[2]]
#> [1] "experimentalTimepoint"
#> 
#> $warnings[[9]][[3]]
#> [1] "On row 3 the attribute experimentalTimepoint does not contain the proper value type num."
#> 
#> $warnings[[9]][[4]]
#> [1] ""

Make corrections in the .csv according to the list of validation errors and warnings above.
Submit corrected manifest via DCA.

Alternative with dataset entity

For working with dataset entities or doing extra checks, this slightly different workflow can be applied instead. Dataset entities may be more complicated because they can combine files from different times and places (batches).

Generate a manifest for a Synapse dataset entity. We need to use remanifest; to understand the differences, read the docs by running ?remanifest.

datasets <- list_project_datasets(project_id = "syn4939902", type = "dataset")
new_datasets <- Filter(function(d) as.Date(d$createdOn) > as.Date("2023-12-01"), datasets) # or filter by name

We will use the first item as the demo.

test <- new_datasets[[1]]$id
remanifest(test, file = "manifest_rd1.csv")
#> ✔️ Saved manifest as manifest_rd1.csv

For reproducibility this original manifest “manifest_rd1.csv” is in the vignettes folder.

Run precheck.

precheck_manifest("manifest_rd1.csv")
#> ❌ Multiple components detected in a single manifest: 'RNASeqTemplate', 'GenomicsAssayTemplate', ''. This can happen when files were annotated at different eras.
#> Suggestions: 1) Split up the manifest because schematic can only validate one type at a type. 2) Harmonize the components if this is sensible.
#> For example, RNASeqTemplate is an alias for GenomicsAssayTemplate
#> ❌ Blank value '' for Component detected. This can happen because files were annotated before 2022, when Component was introduced for most DCCs.
#> ❌ The pattern of these attribute names suggest duplicates: '...1', '...44', '...46'. This may happen when metadata is supplemented programmatically with a data-type mismatch
#> ⚠️ An attribute `Uuid` is present and should preferably be removed. See issue # .
#> ⚠️ An attribute `eTag` is present and preferably be removed.
#> ℹ️ Custom attributes (not documented in data model) were found: 'entityId', '...1', 'Uuid', 'DNA_ID', 'RNA_ID', 'tissue', 'bodyPart', 'parentSpecimenId', 'eTag', '...44', '...46', 'accessTeam', 'accessType', 'sciDataRelease', 'specimenIdSource', 'timePointUnit', 'transplantationDonorTissue', 'transplantationDonorSpecies'. In general, custom attributes added by the researcher to help with data management are fine.
#> Just check that they are not PHI or added by mistake. If they are deemed generally useful or important enough, they can also be documented officially in the data model for others to reference.

Make a copy of “manifest_rd1.csv”, e.g. “manifest_rd1_corrected.csv”, and use the precheck notes to help make corrections.
Validate “manifest_rd1_corrected.csv” using the schematic service (same as above; not run here since general validation output has already been shown).
Make more corrections as needed.
Finally submit corrected manifest.