For this example we are going to generate a candidate codelist for osteoarthritis, looking at the impact of alternative search strategies.
library(DBI)
library(RPostgres)
# postgres database connection details
# Each setting is read from the environment variable of the matching name.
serverDbi <- Sys.getenv("server")
user <- Sys.getenv("user")
password <- Sys.getenv("password")
port <- Sys.getenv("port")
host <- Sys.getenv("host")

# open the DBI connection used by the rest of the example
db <- dbConnect(RPostgres::Postgres(),
  dbname = serverDbi,
  port = port,
  host = host,
  user = user,
  password = password
)
# name of vocabulary schema
# (passed as cdm_schema when the cdm reference is created)
vocabularyDatabaseSchema <- "vocabulary"
# create cdm reference
# Only the vocabulary tables listed below are requested from the schema.
cdm <- CDMConnector::cdm_from_con(
  con = db,
  cdm_schema = vocabularyDatabaseSchema,
  cdm_tables = tidyselect::all_of(c(
    "concept",
    "concept_relationship",
    "concept_ancestor",
    "concept_synonym",
    "drug_strength",
    "vocabulary"
  ))
)
To start we will search for “osteoarthritis”, while excluding “post-infection” and “post-traumatic”, but without searching synonyms, without searching via non-standard codes, without fuzzy matching, and without including descendants or the direct ancestor of the included concepts.
# Baseline search: keyword "osteoarthritis" in the Condition domain with
# every optional search setting switched off and two exclusion terms.
oaCodes1 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = FALSE,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What is the candidate codelist?
# Show the baseline codelist as a paged table without row names.
datatable(
  oaCodes1,
  rownames = FALSE,
  options = list(pageLength = 10, lengthMenu = c(10, 20, 250))
)
Now we will also include the descendants of included concepts.
# Same arguments as the baseline search except includeDescendants = TRUE.
oaCodes2 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = FALSE,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = TRUE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes2, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To2 <- compareCodelists(oaCodes1, oaCodes2) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To2,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Now we will search the observation domain as well as the condition domain.
# Same arguments as the baseline search except that the Observation domain
# is searched in addition to Condition.
oaCodes3 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = c("Condition", "Observation"),
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = FALSE,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes3, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To3 <- compareCodelists(oaCodes1, oaCodes3) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To3,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Now we will search the concept synonym table to identify concepts to include.
# Same arguments as the baseline search except that both synonym options
# (searchInSynonyms and searchViaSynonyms) are TRUE.
oaCodes4 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = TRUE,
  searchViaSynonyms = TRUE,
  searchNonStandard = FALSE,
  fuzzyMatch = FALSE,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes4, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To4 <- compareCodelists(oaCodes1, oaCodes4) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To4,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Now we will search via non-standard codes to identify concepts to include.
# Same arguments as the baseline search except searchNonStandard = TRUE.
oaCodes5 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = TRUE,
  fuzzyMatch = FALSE,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes5, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To5 <- compareCodelists(oaCodes1, oaCodes5) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To5,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Now we will use an approximate search. See https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/agrep for further details (as this is what is running under the hood). Note that here we use the default of 0.1 for substitutions, deletions, and insertions.
# Same arguments as the baseline search except fuzzyMatch = TRUE, with
# maxDistanceCost set explicitly to 0.1.
oaCodes6 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = TRUE,
  maxDistanceCost = 0.1,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes6, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To6 <- compareCodelists(oaCodes1, oaCodes6) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To6,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Here we widen the approximate searching, increasing the maximum distance allowed for a match to 0.2 for substitutions, deletions, and insertions.
# Same arguments as the baseline search except fuzzyMatch = TRUE, with
# maxDistanceCost raised to 0.2.
oaCodes7 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = TRUE,
  maxDistanceCost = 0.2,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = FALSE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes7, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To7 <- compareCodelists(oaCodes1, oaCodes7) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To7,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)
Now we include the direct ancestor of included terms.
# Same arguments as the baseline search except includeAncestor = TRUE.
# maxDistanceCost is still passed here although fuzzyMatch is FALSE.
oaCodes8 <- getCandidateCodes(
  cdm = cdm,
  keywords = "osteoarthritis",
  domains = "Condition",
  searchInSynonyms = FALSE,
  searchViaSynonyms = FALSE,
  searchNonStandard = FALSE,
  fuzzyMatch = FALSE,
  maxDistanceCost = 0.1,
  exclude = c(
    "post-infection",
    "post-traumatic"
  ),
  includeDescendants = FALSE,
  includeAncestor = TRUE
)
What new codes do we pick up?
# Compare oaCodes1 with oaCodes8, keep the rows labelled "Only codelist 2",
# and drop the codelist label column before display.
newCodes1To8 <- compareCodelists(oaCodes1, oaCodes8) %>%
  filter(codelist == "Only codelist 2") %>%
  select(-"codelist")

# Show the result as a paged table without row names.
datatable(newCodes1To8,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    lengthMenu = c(10, 20, 50)
  )
)