Creating YAML files

To upload phenotypes to the HDR UK Phenotype Library using the API, the metadata for each phenotype definition needs to be compiled into individual YAML files. An example YAML file (with annotation) can be downloaded here.

We have created a script to automate the conversion of phenotype metadata (stored in an Excel file) into an individual, formatted YAML file for each phenotype. The output files are saved in the ‘phenotype_yamls’ folder.

The script can be copied from below or downloaded here, and the example metadata file can be downloaded here.

```r
# Libraries
library(readxl)
library(yaml)
library(stringr)

# Load (example) metadata file; each row is turned into one YAML file below
df <- read_excel("example_phenotype_metadata_file.xlsx")

# Output folder for the generated YAML files
output_dir <- "phenotype_yamls"
if (!dir.exists(output_dir) &&
    !dir.create(output_dir, showWarnings = FALSE)) {
  # Fail here with a clear message instead of at the first file write
  stop("Could not create output folder: ", output_dir, call. = FALSE)
}

# Define list columns (adjust if needed): cells in these columns hold one
# entry per line and are converted to lists before being written
list_fields <- c("collections", "tags", "data_sources", "ontology")


#######################
###### Functions ######
#######################

# Function to parse a multi-line text block into a list of named lists
# (eg. concept_information, template)
#
# Blocks are separated by one or more blank lines; every block is a set of
# "key: value" lines and becomes one named list (one dictionary).
#
# Arguments:
#   text_block  A single string (only the first element is used).
#
# Returns:
#   An unnamed list with one named list per block that contained at least one
#   usable "key: value" pair. Returns an empty list when the input is missing
#   or contains no usable pair.
#
# Details:
#   - Lines are split at the FIRST colon, so values may themselves contain
#     colons (eg. URLs).
#   - Lines without a colon, with an empty key or with an empty value are
#     skipped.
#   - All-digit values become integers, unless they are too large for R's
#     integer type; those stay character (as.integer() would give NA).

parse_key_value_block <- function(text_block) {
  if (length(text_block) == 0 || is.na(text_block[1])) {
    return(list())
  }

  # Split into blocks by empty lines
  blocks <- strsplit(
    as.character(text_block[1]), "\\r?\\n\\s*\\r?\\n",
    perl = TRUE
  )[[1]]

  # Each block (chunk of key:value pairs) is a separate dictionary
  parsed_blocks <- lapply(blocks, function(block) {
    lines <- strsplit(block, "\\r?\\n", perl = TRUE)[[1]]

    entry <- list()
    for (line in lines) {
      sep <- regexpr(":", line, fixed = TRUE)[1]
      if (sep < 1) next

      key <- trimws(substr(line, 1L, sep - 1L))
      val <- trimws(substr(line, sep + 1L, nchar(line)))
      if (key == "" || val == "") next

      if (grepl("^\\d+$", val)) {
        num <- as.numeric(val)
        # Only convert when the number fits into a 32-bit integer
        if (num <= .Machine$integer.max) val <- as.integer(num)
      }
      entry[[key]] <- val
    }
    entry
  })

  # Leading/trailing blank lines create empty blocks; drop them so callers
  # never receive an empty dictionary
  Filter(function(entry) length(entry) > 0, parsed_blocks)
}


# Function to clean and format list-style fields
#
# Splits a cell holding one entry per line into a list of entries.
#
# Arguments:
#   value       The cell content (only the first element is used); numbers
#               are accepted and treated as text.
#   field_name  Name of the column the value came from.
#
# Returns:
#   An unnamed list with one element per non-blank line. All-digit entries
#   become integers, unless they are too large for R's integer type; those
#   stay character (as.integer() would give NA). Entries of the
#   "publications" field are wrapped in single quotes instead.

handle_list_field <- function(value, field_name) {
  entries <- strsplit(
    trimws(as.character(value)), "\\r?\\n",
    perl = TRUE
  )[[1]]
  entries <- trimws(entries)

  # Blank lines inside the cell would otherwise become empty list entries
  entries <- entries[!is.na(entries) & nzchar(entries)]

  lapply(entries, function(x) {
    if (field_name == "publications") {
      # NOTE(review): the entry is returned with literal quote characters
      # embedded; confirm the YAML writer does not quote it a second time
      paste0("'", gsub("'", "''", x), "'")  # Force single quotes
    } else if (grepl("^\\d+$", x)) {
      num <- as.numeric(x)
      # Only convert when the number fits into a 32-bit integer
      if (num <= .Machine$integer.max) as.integer(num) else x
    } else {
      x
    }
  })
}


#######################
###### YAML loop ######
#######################


# TRUE when a cell holds a single, non-missing, non-blank value.
# The is.null() check covers metadata files that lack the column entirely.
has_content <- function(x) {
  !is.null(x) && length(x) == 1 && !is.na(x) && trimws(x) != ""
}

# Output paths written so far, used to spot two rows mapping to one file
written_files <- character(nrow(df))

# Loop over rows: each row of the metadata table becomes one YAML file.
# Fields are accessed with `[[` rather than `$` because `$` partially matches
# list names and could silently pick up a similarly named column.
for (i in seq_len(nrow(df))) {
  pheno <- as.list(df[i, ])

  # Format list columns
  for (f in list_fields) {
    if (has_content(pheno[[f]])) {
      pheno[[f]] <- handle_list_field(pheno[[f]], f)
    }
  }

  # Handle structured fields
  if (has_content(pheno[["concept_information"]])) {
    pheno[["concept_information"]] <-
      parse_key_value_block(pheno[["concept_information"]])
  }
  if (has_content(pheno[["template"]])) {
    # The template is a single dictionary: keep the first block that parsed
    # to at least one key (leading blank lines can yield an empty first block)
    template_blocks <- Filter(
      function(block) length(block) > 0,
      parse_key_value_block(pheno[["template"]])
    )
    pheno[["template"]] <-
      if (length(template_blocks) > 0) template_blocks[[1]] else list()
  }

  # Drop empty/null/NA fields
  pheno <- Filter(function(x) {
    !(is.null(x) || (length(x) == 0) ||
        (length(x) == 1 && is.na(x)) ||
        (is.character(x) && all(trimws(x) == "")))
  }, pheno)

  # Convert whole numbers to integers. Values outside the integer range (or
  # non-finite) are left untouched: as.integer() would turn them into NA.
  pheno <- lapply(pheno, function(x) {
    is_whole <- is.numeric(x) && length(x) == 1 && is.finite(x) &&
      abs(x) <= .Machine$integer.max && x == trunc(x)
    if (is_whole) as.integer(x) else x
  })

  # Ensure 'type' stays quoted
  if (!is.null(pheno[["type"]])) {
    pheno[["type"]] <- as.character(pheno[["type"]])
  }

  # Get filename from the 'name' column. Missing/blank names were removed by
  # the filter above, so a NULL here means the row has no usable name.
  pheno_name <- pheno[["name"]]
  if (is.null(pheno_name)) {
    warning(
      "Row ", i, " has no 'name'; no YAML file written for it.",
      call. = FALSE
    )
    next
  }
  tmp_name <- gsub("[^A-Za-z0-9_-]", "_", pheno_name)
  file_name <- file.path(output_dir, paste0(tmp_name, ".yaml"))

  if (file_name %in% written_files) {
    warning(
      "Row ", i, " overwrites ", file_name,
      " (another row produced the same file name).",
      call. = FALSE
    )
  }
  written_files[i] <- file_name

  # Write YAML as string (preserves formatting). The text is written as
  # UTF-8 bytes so non-ASCII characters do not depend on the native locale.
  yaml_text <- as.yaml(pheno)
  writeLines(enc2utf8(yaml_text), con = file_name, useBytes = TRUE)
}

```