Swecris/Case/Cordis/OpenAire Data Assessment

Author
Affiliation
Mohamad Bazzi

KTH Royal Institute of Technology

Text blurb

Rationale — To enhance existing information in CASE by supplementing missing or incomplete data.

Code
R.packages <- c("devtools","dplyr","swecris","kthapi","grid",
                "gridExtra","stringdist","knitr","quarto","ggfittext","xml2",
                "ReDaMoR","gt","janitor","ggplot2","ggpubr","purrr","XML",
                "RecordLinkage","flextable","colorspace","fastLink",
                "magrittr","kableExtra","VennDiagram","cld2","PGRdup",
                "fuzzyjoin","naniar","furrr","stringr","qpcR","dlookr",
                "cordis","bibliotools","OpenAIRE","duckdb","ggalluvial","gapminder",
                "circlize","networkD3","wordcloud","ggwordcloud","RColorBrewer")

invisible(lapply(R.packages, library, character.only = TRUE))
Code
source(file = "/swecris/data-raw/customFunction.R")

Data sources

Code
# Download all sweCris funding data.
swe.Total <- swecris_fundings() # 223 737 rows.

# sweCris data (KTH only: 3137 observations as of 2023-02-16)
sw.Df <- swecris_kth

# CASE data (2593 observations)
ca.Df <- bibliotools::case()

# Adjust column names.
colnames(ca.Df)[16] <- "ProjectNumber"
colnames(ca.Df)[14] <- "PrimaryResearcher"
colnames(ca.Df)[6] <- "FundingOrganisation"
Code
include_graphics("/swecris/analytic img/Old Images/Figure-1.png")

Figure 1. Data-driven flowchart (matching keys).

Code
include_graphics("/swecris/analytic img/Old Images/Figure-2.png")

Figure 2. Entity/Relationship (ER) diagram.

Code
db_connections <- read.csv2(file = "/swecris/alluvialGraph.csv",header = T)
# Plot. 
# chordDiagram(db_connections,
#              col = cols,
#             transparency = 0.1,
#             link.lwd = 2,
#             link.lty = 2)

# circos.clear()

CASE data-quality

Code
ca_figure_a <- bazzi.na.Total(x = ca.Df, col = "black",typographic = T) + 
  theme(axis.text.x = element_text(size = 9,angle = 45),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9), 
        axis.title.y.right = element_text(size = 9),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

CASE data-quality by school

Code
# School data-frame.
ca.school <- ca.Df
ca.school$school_short <- as.factor(ca.school$school_short)
# Split data-frame by factor.
school <- ca.school %>% 
  group_by(school_short)
# Droplevels, set names, and plot.
split.school <- group_split(school)

for(i in seq_along(split.school)) {
  split.school[[i]]$school_short <- droplevels(split.school[[i]]$school_short)
}

schoolName <- c("ABE","CBH","EECS","ITM","SCI")

ls.plot <- list()

for(i in seq_along(split.school)) {
  out <- split.school[[i]] %>% 
    bazzi.na.Partial(only_na = TRUE,col = "blue",typographic = T,
                   main = schoolName[i]) +
    theme(axis.text.x = element_text(size = 5),
          axis.text.y = element_text(size = 5),
          axis.title.x = element_text(size = 5),
          axis.title.y = element_text(size = 5),
          axis.title.y.right = element_text(size = NULL),
          plot.title = element_text(size=7),
          legend.title = element_text(size = 6),
          legend.text = element_text(size = 6),
          legend.key.size = unit(1,"mm"),
          legend.key.height= unit(1, 'mm'),
          legend.key.width= unit(1, 'mm'),
          plot.margin=unit(c(-0.5,1,1,1), "mm"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    coord_flip()
  ls.plot[[i]] <- out
}

do.call("grid.arrange", c(ls.plot, ncol=3))

Figure 4. Pareto chart for missing values by school in CASE. Only variables with missing values are included.

On a school level, ABE has 17 fields with a missingness rate of more than 50%. CBH has 18 fields with a missingness of more than 50%. EECS and SCI schools both have 25 fields with missingness > 50%. ITM has only 13 fields with missingness greater than 50%. In total, 28 fields were missing for NA.

sweCris data-quality

Code
# Convert all Unknown to NA.
sw.Df[,] <- apply(sw.Df, 2, function(x) ifelse(x == "Unknown", NA, x))
# Plot.
bazzi.na.Total(x = sw.Df, col = "black",typographic = T) +
  theme(axis.text.x = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9), 
        axis.title.y.right = element_text(size = 9))

Figure 5. Pareto chart for missing values (counts and proportions) in sweCris.

In the sweCris dataset, the column-wise completeness is generally satisfactory. The only notable exceptions are the abstract fields in both Swedish and English, which tend to have a high prevalence of missing data. Additionally, the project title in English is not consistently recorded with a high degree of accuracy. The involved people field is also not 100% complete.

What does sweCris contain not present in Case?

Range

Code
range(ca.Df$beg,na.rm = T) # Case - 2009
[1] "2009-03-01" "2024-03-20"
Code
range(sw.Df$FundingStartDate) # sweCris - 2008
[1] "2008-01-01" "2023-01-01"
Code
# Number of Research Projects by Funding Start Year?
ca.Df["startYear"] <- format(ca.Df$beg,"%Y")
sw.Df["startYear"] <- format(as.Date(sw.Df$FundingStartDate),"%Y")
# Check Case.
k_1 <- ca.Df %>% group_by(type,startYear) %>% 
  filter(type == "Research project") %>%
  count(sort = F) %>%
  ggplot2::ggplot(mapping = aes(x = startYear,y = n)) +
  labs(title = "Case: Research Projects") +
  geom_col() + theme(axis.text.x = element_text(angle = 45))
# Check sweCris
k_2 <- sw.Df %>% group_by(startYear) %>%
  count(sort = F) %>%
  ggplot2::ggplot(mapping = aes(x = startYear,y = n)) +
  labs(title = "sweCris") +
  geom_col() + theme(axis.text.x = element_text(angle = 45))

ggpubr::ggarrange(k_1,k_2) + theme(aspect.ratio = 1/3)

Code
## More sweCris graphics.
# Size of funding in sweCris along with the average.
sw_fund <- sw.Df %>% mutate(FundingsSek = as.numeric(FundingsSek)) %>%
  group_by(FundingsSek,startYear) %>%
  ggplot(mapping = aes(x = startYear, y = FundingsSek)) + 
  scale_y_continuous(labels = scales::comma) +
  geom_hline(yintercept=mean(as.numeric(sw.Df$FundingsSek)), linetype="dashed", 
                color = "red", size=1) +
  geom_point()
  
# Type of Award.
sw_award <- sw.Df %>% group_by(TypeOfAwardDescrEn) %>% count(sort = T) %>%
  ggplot(aes(x = reorder(TypeOfAwardDescrEn,-n), y = n)) +
  geom_col() + theme(axis.text.x = element_text(angle = 45, size = 6, vjust = .5)) +
  labs(x = "", title = "Type of Award", y = "") + coord_flip()

1A. Project number (Variable identifier)

  • Exact matches on ID

The Project number variable in CASE correspond to the Project ID field in sweCris. However, to facilitate string intersection or matching, one is required to reformat the key in one of the datasets. A cursory view at the sweCris dataset reveal a predominantly consistent ID form/style. For that reason, I have chosen to use the sweCris ID format as a baseline for how to ‘clean’ CASE project numbers.

Exploratory data analysis reveal that the ID format in both datasets are organisation-specific (e.g. projects funded via the Swedish Energy Agency takes a unique form: see below). In the code that follows I have split the ‘cleaning’ procedure into first standardizing the Swedish Energy Agency funded project ID’s (including the removal of empty values , non-numeric characters, and the addition of letters at the beginning and end of each ID), and then applying a similar set of operations to the remaining project data. More specifically, I created a new field corresponding to a ‘clean ID’ with only numeric characters and hyphen symbols retained. This was then filtered to keep elements with 10 characters only. The chosen value is not arbitrary, but is a reflection of the number of elements found in the majority-class of ID’s in sweCris.

Code
# Subset CASE by Funding Organisation = Swedish Energy Agency.
sea.ca <- subset(ca.Df, FundingOrganisation == "Energimyndigheten")
# Remove NA and superfluous text.
sea.ca <- sea.ca[!is.na(sea.ca$ProjectNumber),]
sea.ca <- sea.ca[sea.ca$ProjectNumber %in% as.vector(na.omit(sea.ca$ProjectNumber[sapply(sea.ca$ProjectNumber, nchar) == 7])),]
# Add P to number strings.
sea.ca$ProjectNumber <- paste0("P", gsub("","",sea.ca$ProjectNumber))
# Add organisation name to string.
sea.ca$ProjectNumber <- paste(sea.ca$ProjectNumber,"_Energi")
sea.ca$ProjectNumber <- gsub("\\s","", sea.ca$ProjectNumber)
# Repeat: Keep numbers only.
ca.Df["clean.IDA"] <- gsub("[^0-9.-]","",ca.Df$ProjectNumber)
# Keep 10 character elements only.
ca.Df.sub <- ca.Df[ca.Df$clean.IDA %in% as.vector(na.omit(ca.Df$clean.IDA[sapply(ca.Df$clean.IDA, nchar) == 10])),] # 312 observations.
ca.Df.sub$clean.IDA <- gsub("^-","",ca.Df.sub$clean.IDA)
# Remove empty funding organisation.
ca.Df.sub$FundingOrganisation[ca.Df.sub$FundingOrganisation == "Not Applicable N/A"]  <- ""
# Standardize against sweCris names.
ca.Df.sub$FundingOrganisation <- recode(ca.Df.sub$FundingOrganisation,
                                        Energimyndigheten = "Energi",
                                        VINNOVA = "Vinnova",
                                        Vetenskapsrådet = "VR")

# Combine
ca.Df.sub["clean.IDA"] <- paste(ca.Df.sub$clean.IDA,
                                ca.Df.sub$FundingOrganisation,
                                sep = "_")

ca.Df.sub["clean.IDA"] <- gsub("^-","",ca.Df.sub$clean.IDA)

# Combine dataframes.
cleanID.CASE <- dplyr::bind_rows(sea.ca,ca.Df.sub) # 389 observations.
cleanID.CASE[1:77,"clean.IDA"] <- cleanID.CASE[1:77,"ProjectNumber"]
# Remove missing organisations.
cleanID.CASE <- cleanID.CASE[!cleanID.CASE$FundingOrganisation == "",] # 386 observations.
Code
# How many in 00000-0_ZZ shape in sweCris? They are all Energy funded 
as.vector(na.omit(str_extract(sw.Df$ProjectId, "(\\d+)-(\\d{1})_[:alpha:]+"))) %>%
  length() # 369 (11.76%)
# How many in 0000-00000_ZZ format?
as.vector(na.omit(str_extract(sw.Df$ProjectId, "(\\d+)-(\\d{5})_[:alpha:]+"))) %>%
  length() # 2680 (85%). In total with the aforementioned = 97.19%

Duplicates in CASE?

In total, there are 20 instances of duplicate project titles in CASE at the level research projects. These duplicate projects can be identified by their distinct project IDs and varying statuses, such as application, ongoing, or rejected.

Code
# Title duplicates in title name.
# ca.Df[duplicated(ca.Df["Name"], fromLast = T), ] %>% filter(!is.na(Name))

# Title duplicates in title names for research projects.
res_proj <- ca.Df[ca.Df$type == "Research project", ]
# na.omit(res_proj$Name[duplicated(res_proj$Name)]) %>% length
# Examine the projects with n > 1.
res_proj_dup <- 
  ca.Df[ca.Df$Name %in% (res_proj$Name[duplicated(res_proj$Name)])[-c(1:2)],] %>%
  filter(type != "Research Infrastructure" & type != "Research Centra")
# Plot results.
ca_figure_b <- res_proj_dup %>%
  ggplot(mapping = aes(x = `Project ID`, fill = Status)) +
  geom_bar() + facet_wrap(.~Name+FundingOrganisation, scales = "free")

Pattern expressions (Project Number)

At the character level there are 37 levels. Each level (e.g., “10” may include several ID formats). At least 9 IDs are either duplicates or triplets. Interestingly, the duplicated project numbers are not the same projects as indicated by different project titles.

Code
# How many different patterns are available in the Project Number field?
ids <- na.omit(ca.Df$ProjectNumber)
# Data frame.
ids.nchar <- data.frame(ids = ids,
                        char.num = nchar(ids),
                        name = ca.Df[ca.Df$ProjectNumber %in% ids,]$Name) # 1080 projects.

# At the character unit there are 37 levels.
# Each level (e.g., "10" may include several ID pattern expressions)
ids_expression <- ids.nchar %>% 
  dplyr::group_by(char.num) %>% count(sort = T) %>% print(n = 37)
# A tibble: 38 × 2
# Groups:   char.num [38]
   char.num     n
      <int> <int>
 1       10   303
 2        6   231
 3        9   157
 4        7    88
 5       11    70
 6        5    49
 7        8    42
 8       12    17
 9       13    16
10       18    14
11       14    12
12       24    12
13       15     8
14       20     8
15       19     6
16       23     6
17       37     6
18       16     5
19       17     5
20        4     4
21       21     3
22       32     3
23       25     2
24       26     2
25       27     2
26       28     2
27       34     2
28       38     2
29       69     2
30        3     1
31       22     1
32       29     1
33       30     1
34       31     1
35       36     1
36       51     1
37       56     1
# … with 1 more row
Code
# Extract the values for a specific char.num
values_for_char_num <- ids.nchar %>% 
  filter(char.num == 10) %>% 
  pull(ids); attributes(values_for_char_num)$na.action <- NULL

# Plot.
ca_figure_c <- ids_expression %>%
  ggplot(mapping = aes(x = as.factor(char.num),y = n)) +
  geom_col() + theme(aspect.ratio = 1/2)

# Plot project types in Case.
ca_figure_d <- ca.Df %>% group_by(type) %>% count(sort = T) %>%
  ggplot(mapping = aes(x = reorder(type, -n),y = n)) +
  geom_col() + coord_flip() + theme(aspect.ratio = 1/2)
Code
# How many true project ID's are duplicated?
true_dup <- ids.nchar %>%
  dplyr::select(-starts_with("name")) %>%
  add_count(ids) %>%
  filter(n > 1) %>%
  distinct() # 8 in total.

# Are these the same projects or are they completely different?
ids.nchar %>%
  group_by(ids) %>%
  filter(n()>1) %>%
  ungroup() %>% arrange(desc(ids))

NB: This shows that the duplicated ID’s in Case correspond to different project titles at the level of research projects.

Projects by funding agency

Code
f1 <- ca.Df %>% group_by(FundingOrganisation) %>% count(sort = T) %>%
  ggplot(mapping = aes(x = reorder(FundingOrganisation, -n), y = n)) +
  geom_col() + labs(x = "", title = "Case") +
  annotate("rect", xmin = 1.5, xmax = 2.5, ymin = -1, ymax = 600,
             alpha = 0, color= "green") + coord_flip()

f2 <- sw.Df %>% group_by(FundingOrganisationNameEn) %>% count(sort = T) %>%
  ggplot(mapping = aes(x = reorder(FundingOrganisationNameEn, -n), y = n)) +
  geom_col() + coord_flip() + labs(x = "", title = "sweCris")

grid.arrange(f1,f2, ncol = 2)

Figure 8. Projects by funding agency

Code
# Table 1.
Tab.1 <- ca.Df %>%
  group_by(FundingOrganisation) %>%
  count(sort = F) # 29 funding organisations in Case.
# Table 2.
Tab.2 <- sw.Df %>%
  group_by(FundingOrganisationNameSv) %>%
  count(sort = F) # 7 in sweCris.

# Table 3 (Hypothetical matches)
Fund.Org <- c("VR","VINNOVA","Formas","Energimyndigheten",
              "Rymdstyrelsen","Forte","Riksbankens Jubileumsfond")

Tab.3 <- cbind(Tab.2,Tab.1[Tab.1$FundingOrganisation %in% Fund.Org, ])

Tab.3[,3:4] <- Tab.3[c(2,3,4,5,1,7,6),3:4]

colnames(Tab.3)[1] <- "sweCris"
colnames(Tab.3)[2] <- "Sample"
colnames(Tab.3)[3] <- "CASE"
colnames(Tab.3)[4] <- "Sample"

# 755 Projects.
sum(Tab.1[Tab.1$FundingOrganisation %in% Fund.Org, ]$n)
Code
# Change Not Applicable N/A to true NA.
ca.Df$FundingOrganisation[ca.Df$FundingOrganisation == "Not Applicable N/A "] <- NA
Code
knitr::kable(x = Tab.3,
caption = "Table 1. Agencies involved in project funding in Case. The funding organization is unknown for 577 CASE projects. SweCris has seven funding agencies and CASE has 29. The hypothetical project match is 755. The question that follows is how many projects can we match using available identifiers?") %>%
  kable_styling() 
Table 1. Agencies involved in project funding in Case. The funding organization is unknown for 577 CASE projects. SweCris has seven funding agencies and CASE has 29. The hypothetical project match is 755. The question that follows is how many projects can we match using available identifiers?
sweCris Sample CASE Sample
Formas 407 Formas 93
Forte 13 Forte 8
Riksbankens Jubileumsfond 12 Riksbankens Jubileumsfond 6
Rymdstyrelsen 74 Rymdstyrelsen 7
Statens energimyndighet 371 Energimyndigheten 206
Vetenskapsrådet 1559 VR 116
Vinnova 701 VINNOVA 330

Q1. Viewed from a funding organisation perspective, what is the degree of overlap between projects in Case and Swecris? What options are there to ameliorate large gaps or non-overlapping data. Does one data source lend itself better for a given purpose? For example, in the sweCris data there is plenty of information about funding amounts awarded to any given project, whereas, in Case this field is notably depauperate or simply missing at a rate of 93%.


Remarks including recommendations

  1. The project number field in CASE would benefit from a re-structuring and consistent ID formatting across projects. This would make database comparisons much easier. Equally, the high frequency of NA’s (n=1531) in the Project Number column, along with at least 8 true duplicates and one triplet constitutes an additional data-quality issue in CASE. Notably, these duplicated IDs do not correspond to duplicated project names. This means that a few projects in CASE have similar IDs.

  2. Furthermore, project ID’s associated with the Swedish Energy Agency often lack the initial P letter. Additionally, many projects lack an ID in entirety, some have more than one ID along with text, and others are made up by text only. This makes this field very unstructured. (see remarks above).

  3. Other data-set pathologies in CASE include high proportion of missingness and projects with similar project titles (i.e. duplicates: see above).

  4. ID cleaning was done using a three-step process:

  • Removal of all non-numeric characters.
  • Retaining all 9 digits + hyphen elements (this is the conventional label shape in sweCris: also see note above).
  • Add funding organisation to CASE ID.
Code
sweCris.format <- data.frame(Format = c("P00000-0_ZZ","0000-00000_ZZ",
                                        "000/00_ZZ","00/00_ZZ"),
                             Percentage = c("11.76","85.0","1.43","1.88"),
                             N = c(369,2680,45,59),
                             Remarks = c("Swedish Energy Agency affiliated",
                                         "VR, Vinnova, & Formas",
                                         "SNSB affiliated","SNSB affiliated"))
Code
knitr::kable(x = sweCris.format,
             caption = "Table 2. There are at least eight different ID styles/formats in sweCris. Together, 0000-00000_ZZ and P00000-0_ZZ make up ~97% of all ID patterns in sweCris.") %>% 
  kable_styling() %>%
  kableExtra::add_footnote(label = "Not all ID patterns are shown")
Table 2. There are at least eight different ID styles/formats in sweCris. Together, 0000-00000_ZZ and P00000-0_ZZ make up ~97% of all ID patterns in sweCris.
Format Percentage N Remarks
P00000-0_ZZ 11.76 369 Swedish Energy Agency affiliated
0000-00000_ZZ 85.0 2680 VR, Vinnova, & Formas
000/00_ZZ 1.43 45 SNSB affiliated
00/00_ZZ 1.88 59 SNSB affiliated
a Not all ID patterns are shown
Code
# Subset CASE data (both NA's and duplicates in Project Number are removed)
cleanID.CASE <- cleanID.CASE %>% filter(!duplicated(clean.IDA)) %>%
  filter(!duplicated(Name)) # 384 projects.

# Confirm. Looks fine.
which(duplicated(x = cleanID.CASE$Name) == T)
which(duplicated(x = cleanID.CASE$clean.IDA) == T)

# How many ID's match?
# 139 projects match. This approach might be to stringent.
generics::intersect(cleanID.CASE$clean.IDA,sw.Df$ProjectId) %>% length()

# This will add information to those project that match between the datasets.
# Adds 27-1 more variables. Check their level of completeness.
Match.1 <- cleanID.CASE %>% 
  left_join(sw.Df, by = c("clean.IDA" = "ProjectId"))

# Parse out the projects that matched: this is equivalent to an inner join.
MissCheck.1 <- Match.1[Match.1$clean.IDA %in% 
                         generics::intersect(cleanID.CASE$clean.IDA,
                                             sw.Df$ProjectId), ]
# Convert into NA.
# MissCheck.1 <- MissCheck.1 %>% 
#  mutate(across(c("ProjectTitleSv","ProjectTitleEn",
#                  "ProjectAbstractSv","ProjectAbstractEn"), na_if, "Unknown"))

# Project matches based on the ID identifier are assoicated 4 funding organizations.
# This leaves out projects funded by Forte, Riksbankens Jubileumsfond, and Rymdstyrelsen.
MissCheck.1 %>%
  group_by(FundingOrganisation) %>%
  summarise(n = n()) %>%
  arrange(-n)

# Degree of missingness?
sapply(MissCheck.1[,1:40], function(x) sum(is.na(x)))
sapply(MissCheck.1[,41:66], function(x) sum(is.na(x)))

Data quality after matching

Code
bazzi.na.Total(x = MissCheck.1, col = "black",typographic = T) + 
  theme(axis.text.x = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9), 
        axis.title.y.right = element_text(size = 9))

Figure 7. Pareto chart for missing values (counts and proportions) in the reduced CASE dataset (i.e. the remaining projects that satisfies the matching criterion using clean ID’s).

Results (1/3)

  1. The number of project matches depend on how ID standardization is done.
  2. Using the two predominant ID styles in sweCris (see above) as a reference, yields 139 distinct project matches. Generally, most CASE ID’s are inconsistently written, which makes it very difficult to merge against sweCris. The issue is further exacerbated by variable ID formats in sweCris. But is generally better than CASE.
  3. Of the 384 CASE projects with a clean ID, only ~36% (N=139) could be matched against the sweCris dataset.
  4. Viewed altogether, only 5.32% of unique CASE projects (N=2611) could be detected in sweCris using project ID as an identifier. In terms of ‘hypothetical matches’ (see Table 1) , the matching level rise to ~18%.
  5. Overall, despite the low number of project matches, the added information retrieved via sweCris is rather complete (column-wise). This is unsurprising as the data quality in sweCris tends to be better.

Code
# Venn diagram showing project overlap based on project ID's.
mP1 <- draw.pairwise.venn(area1= 3137,area2 = 746,
                          cross.area = 139,
                          category = c("sweCris projects with an ID",
                                       "CASE projects with an ID"),
                          fill = c("grey80","#7b0323"),
                          scaled = T,cex = 1,
                          rotation.degree = 45)
Code
ggarrange(mP1) + theme(aspect.ratio = 1)

Figure 8. Venn diagram showing project overlap based on project ID’s. Area 1 (Grey) = sweCris projects limited to funding organizations shown in Table 1. Area 2 (Red) = CASE projects.

Code
cleanID.CASE %>% group_by(Role) %>% count(sort = T)

1B. Key: Project number

In-exact matches on ID

Code
# Fuzzy matching.
Fuzz.ID <- cleanID.CASE %>%
  stringdist_inner_join(x = sw.Df,by = c(ProjectId = "clean.IDA")) %>%
  filter(!duplicated(clean.IDA)) %>%
  filter(!duplicated(Name)) # 332 matches?

Fuzz.ID["lev"] <- lev <- levenshteinSim(Fuzz.ID$clean.IDA,Fuzz.ID$ProjectId)
Fuzz.ID["jaro"] <- jaro <- jarowinkler(Fuzz.ID$clean.IDA,Fuzz.ID$ProjectId)

Evaluating fuzzy matching

Code
knitr::kable(slice_head(Fuzz.ID[,c("clean.IDA","ProjectId","lev","jaro")],n = 10),
  caption = "Table 3. Fuzzy matching. Lev = levenshtein distance (%), jaro = jaro-winkler distance.",
  digits = 2) %>%
  kable_styling()
Table 3. Fuzzy matching. Lev = levenshtein distance (%), jaro = jaro-winkler distance.
clean.IDA ProjectId lev jaro
2022-04907_VR 2022-00901_VR 0.85 0.92
2022-04900_VR 2022-00901_VR 0.85 0.95
2022-01905_VR 2022-00901_VR 0.85 0.95
2022-06078_VR 2022-01079_VR 0.85 0.94
2022-01284_Vinnova 2022-01624_Vinnova 0.89 0.98
2022-01725_Vinnova 2022-01624_Vinnova 0.89 0.96
2022-01004_Vinnova 2022-01624_Vinnova 0.89 0.96
2022-01614_Vinnova 2022-01624_Vinnova 0.94 0.98
2022-01240_Vinnova 2022-01624_Vinnova 0.89 0.98
2022-01600_Vinnova 2022-01624_Vinnova 0.89 0.96

Miss-classification rate

Code
paste(round(100-(which(Fuzz.ID$lev == 1.0000000) %>% 
                   length()/(length(Fuzz.ID$lev)))*100,2),"%")
[1] "90.7 %"

2. Key: Project Title.

Code
# How many NA project titles are there in sweCris?
sum(is.na(sw.Df$ProjectTitleSv)) # 77.
sum(is.na(sw.Df$ProjectTitleEn)) # 420. Seem very excessive! Something have happend.
# How many NA project titles are there in CASE?
sum(is.na(ca.Df$Name)) # 3

Whats the proportion of Swedish versus English titles in Case?

Code
# Add
ca.Df["language"] <- NA
# Language identifier
system.time(
  for(i in 1:length(ca.Df$Name)) {
  ca.Df$language[[i]] <- tolower(cld2::detect_language(text = ca.Df$Name[[i]],
                                                       plain_text = T,
                                                       lang_code = F))
}
)

# Correct
ca.Df$language[ca.Df$language == "norwegian"] <- "swedish"

# Summary.
ca.Df %>%
  group_by(ca.Df$language) %>%
  count(sort = T) %>%
  print(n=45)

# Unknowns, are a mix of Swedish & English titles.
ca.Df$language[is.na(ca.Df$language)] <- "Unknown"

# Other languages?
ca.Df %>% group_by(Name,language) %>%
  dplyr::select(Name,language) %>%
  filter(language == "welsh" | language == "uzbek" | 
         language == "turkish" | language == "portuguese" |
         language == "nyanja" | language == "kurdish" |
         language == "indonesian" | language == "hungarian" |
         language == "galician" | language == "croatian" |
         language == "catalan" | language == "albanian" |
         language == "afrikaans" | language == "spanish" |
         language == "kinyarwanda" | language == "ganda" |
         language == "estonian" | language == "basque" |
         language == "german") %>% 
  print(n = 28)
Code
# Subset based on intersection.
# Remove extra spaces and use case-folding to improve intersection.
merge_titles <- c(generics::intersect(
  x = gsub("\\s+"," ", iconv(tolower(na.omit(ca.Df$Name)),to = "ASCII//TRANSLIT")),
  y = gsub("\\s+"," ",iconv(tolower(na.omit(sw.Df$ProjectTitleSv)),
                            to = "ASCII//TRANSLIT"))),
  generics::intersect(x = gsub("\\s+"," ",
                               iconv(tolower(na.omit(ca.Df$Name)),to = "ASCII//TRANSLIT")),
  y = gsub("\\s+"," ",iconv(tolower(na.omit(sw.Df$ProjectTitleEn)),
                            to = "ASCII//TRANSLIT"))))
# Any duplicates? Yes.
duplicated(merge_titles)
# Remove.
merge_titles <- unique(merge_titles) # 140
# Subset
sub.ca.Df.2 <- ca.Df
sub.ca.Df.2$Name <- gsub("\\s+"," ",iconv(tolower(ca.Df$Name),
                                          to = "ASCII//TRANSLIT"))

sub.ca.Df.2 <- sub.ca.Df.2[sub.ca.Df.2$Name %in% merge_titles, ]
sub.ca.Df.2$Name[duplicated(sub.ca.Df.2$Name)] # duplicate?

# Clean.
sub.ca.Df.2 <-
  sub.ca.Df.2 %>%
  group_by(Name) %>%
  filter(!duplicated(Name))

# sweCris modification to enable inner-join.
modify.sw.Df <- sw.Df
modify.sw.Df$ProjectTitleSv <- gsub("\\s+"," ",
                                    iconv(tolower(sw.Df$ProjectTitleSv),
                                          to = "ASCII//TRANSLIT"))
modify.sw.Df$ProjectTitleEn <- gsub("\\s+"," ",
                                    iconv(tolower(sw.Df$ProjectTitleEn),
                                          to = "ASCII//TRANSLIT"))

# Merge in a two step fashion.
parseSwe <- ca.Df %>%
  filter(!is.na(Name)) %>%
  mutate(Name = gsub("\\s+"," ",iconv(tolower(na.omit(ca.Df$Name)),
                                      to = "ASCII//TRANSLIT"))) %>%
  group_by(Name) %>% 
  filter(!duplicated(Name)) %>% 
  inner_join(modify.sw.Df, by = c("Name" = "ProjectTitleSv"))
  # distinct(Name)

parseEng <- ca.Df %>%
  filter(!is.na(Name)) %>%
  mutate(Name = gsub("\\s+"," ",iconv(tolower(na.omit(ca.Df$Name)),
                                      to = "ASCII//TRANSLIT"))) %>%
  group_by(Name) %>% 
  filter(!duplicated(Name)) %>% 
  inner_join(modify.sw.Df, by = c("Name" = "ProjectTitleEn"))
  # distinct(Name)

# Match by project title enrichment.
Match.2 <- rbind(parseSwe,parseEng)
Match.2 <- Match.2 %>% filter(!duplicated(Name))
Code
# Create one title field in sweCris.
sw.Df <- sw.Df %>% mutate(Unified.Title = 
                            ifelse(is.na(ProjectTitleSv),
                                   ProjectTitleEn,ProjectTitleSv))
# Duplicated Project Names in Case.
ca.Df %>% add_count(Name) %>% filter(n > 1) %>% distinct() %>% 
  group_by(Name) %>% summarise(n = n()) # %>% count(n)

# What about in sweCris?
sw.Df %>% add_count(Unified.Title) %>% filter(n > 1) %>% distinct() %>% 
  group_by(Unified.Title) %>% summarise(n = n()) %>% count(n)
Storing counts in `nn`, as `n` already present in input
ℹ Use `name = "new_name"` to pick a new name.

Results (2/3)

  1. 140 exact project matches based on Project Title. This is on par with the number of matches captured using Project ID. An additional 11 projects could be added based on fuzzy matching using a threshold of 0.85.
  2. Data-quality issues: Three NA’s in the Name/Title field in Case were found. At least 4 project titles have triplet occurrences, and 15 titles occur twice (i.e. true duplicates) in Case.
  3. Data-quality issues: In sweCris there are 78 project titles that are pure duplicates, 5 triplets, and four quadruplets.
Code
## Venn diagram showing projects overlap based on project title.
mP2 <- draw.pairwise.venn(area1= 3068,area2 = 746,
                          cross.area = 140,
                          category   = c("SweCris-titled projects",
                                         "CASE-titled projects"),
                          fill = c("grey80","darkcyan"),
                          scaled = T,cex = 1,
                          rotation.degree = 45)
Code
ggarrange(mP2) + theme(aspect.ratio = 1)

Figure 9. Venn diagram showing project overlap based on project title. Area 1 (Grey) = sweCris. Area 2 (Red) = Case


Combined results

Code
ggarrange(mP1,mP2) + theme(aspect.ratio = 1/2)

Figure 10. Comparison of results

Does ID vs Title identifiers return the same matching projects?

No. Only, 79 projects are the same. In other words, the project match depend to some extent on the identifier being used. Jointly, however, ID and Project Title yield 203 project matches against sweCris.

Code
# Project intersections.
generics::intersect(gsub("\\s+"," ",
                         iconv(tolower(na.omit(MissCheck.1$Name)),
                               to = "ASCII//TRANSLIT")),Match.2$Name)
# Projects set apart.
generics::setdiff(gsub("\\s+"," ",
                       iconv(tolower(na.omit(MissCheck.1$Name)),
                             to = "ASCII//TRANSLIT")),Match.2$Name)
# Combine.
Total.M1 <- merge(MissCheck.1,Match.2,all = T)
Total.M1$Name <- iconv(tolower(Total.M1$Name),to = "ASCII//TRANSLIT")
# Subset.
Total.M1 <- Total.M1 %>% filter(!duplicated(Name))

Fuzzy-matching based on title sound

Gestalt pattern matching

This adds an additional 9 projects using the soundex method followed by jarowinkler computation of similarity-index.

Code
fuzzy_Title_SX <- ca.Df %>%
  fuzzyjoin::stringdist_inner_join(sw.Df, by=c(Name="Unified.Title"),
                       method="soundex", max_dist = 0) %>%
  filter(!duplicated(ProjectId)) %>%
  filter(!duplicated(Name))

fuzzy_Title_SX["JW"] <- lev <- jarowinkler(fuzzy_Title_SX$Name,
                                           fuzzy_Title_SX$Unified.Title)
# Check again
EXA <- fuzzy_Title_SX %>% 
  filter(JW >= .85)

EXA <- EXA %>%
  mutate(ProjectTitleEn = gsub("\\s+"," ",iconv(tolower(EXA$ProjectTitleEn),
                                                to = "ASCII//TRANSLIT")))
# How many extra? 11 projects.
rM <- generics::intersect(Match.2$ProjectTitleEn,
                          gsub("\\s+"," ",iconv(tolower(EXA$ProjectTitleEn),
                                                to = "ASCII//TRANSLIT")))

EXB <- EXA[!(EXA$ProjectTitleEn %in% c(rM)), ]

# Combine extras with exact title matches.
Match.2B <- rbind(Match.2,EXB)
Match.2B <- Match.2B %>% filter(!duplicated(Name))

3. Key: Primary Researcher

  1. The InvolvedPeople column have a complex string structure with relevant information that ought to be parsed out to facilitate maximum matching with the CASE database. The main problem here is that the name of the project leaders, including participating researchers are bind into poorly structured text strings. To circumvent, I have attempted to parse out name and role based on their general position in each string.

  2. The primary researcher information is missing from 696 projects (22%) in SweCris. 17 of these do however include partial information, but not the name of the PI. Why?

  3. The name of 2441 Principal Investigators (77.81%) could successfully be retrieved via sweCris.

  4. 69 exact name matches between CASE and sweCris.

Code
sw.Df["PrimaryResearcher"] <- ""
sw.Df["Leading.Role"] <- ""

# Generalize.
for(i in 1:length(sw.Df$InvolvedPeople)) {
  # Index.
  iX <- strsplit(sw.Df$InvolvedPeople[[i]],"¤")[[1]][-c(1:3)]
  # Attain name via PI position
  sw.Df$PrimaryResearcher[[i]] <- 
    iX[grep("Principal Investigator",
            strsplit(sw.Df$InvolvedPeople[[i]],"¤")[[1]][-c(1:3)])[1]-2]
  # Clean names, with no extra spaces.
  sw.Df$PrimaryResearcher[[i]] <- gsub("\\s+"," ",sw.Df$PrimaryResearcher[[i]])
  # Remove accents.
  sw.Df$PrimaryResearcher[[i]] <- iconv(sw.Df$PrimaryResearcher[[i]],
                                        to = "ASCII//TRANSLIT")
  # Role.
  sw.Df$Leading.Role[[i]] <- 
    iX[grep("Principal Investigator",
            strsplit(sw.Df$InvolvedPeople[[i]],"¤")[[1]][-c(1:3)])[1]]
}
Code
# Summarize.
sw.Df %>% group_by(Leading.Role) %>% count(sort = T)

sw.Df %>% group_by(PrimaryResearcher) %>%
  filter(!is.na(PrimaryResearcher)) %>%
  # filter(PrimaryResearcher != "Unknown") %>%
  count(sort = T) %>%
  filter(n > 10) %>%
  ggplot(aes(x = as.factor(PrimaryResearcher), y = n)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() + 
  labs(title = "Graph showing primary researcher with >10 projects in sweCris",
       x = "Primary Researcher")

Figure X. Primary Researchers in sweCris with projects > 10

Code
# Summarize (Case).
ca.Df %>%
  group_by(PrimaryResearcher) %>%
  count(sort = T)
Code
# Number of projects by Primary Researcher, along with Leading Role.
sw_primary <- sw.Df %>% group_by(PrimaryResearcher) %>% count(sort = T) %>%
  filter(n > 10) %>% 
  ggplot(mapping = aes(x = reorder(PrimaryResearcher,-n), y = n)) +
  geom_col() + coord_flip() +
  labs(title = "Number of projects by Primary Researcher: n > 10")
Code
# swecris_project_people(project_id = "2021-00527_Formas")
Code
# Re-code.
sw.Df[is.na(sw.Df)] <- "Unknown"
sw.Df.Unknown <- subset(sw.Df,(Leading.Role %in% "Unknown"))
# Rows (n=17) were Involved People have limited information, but not PI name.
string.InNA <- which(grepl(pattern = "¤¤¤",x = sw.Df.Unknown$InvolvedPeople) == T)
# Subset to retain all NA.
sw.df.Partial <- sw.Df.Unknown[string.InNA, ]
Code
# Remove accent from primary research names in CASE.
ca.Df$PrimaryResearcher <- iconv(ca.Df$PrimaryResearcher,to = "ASCII//TRANSLIT")
# How many name matches?
generics::intersect(na.omit(ca.Df$PrimaryResearcher),
                    na.omit(sw.Df$PrimaryResearcher)) %>% unique()
generics::intersect(na.omit(ca.Df$PrimaryResearcher),
                    na.omit(sw.Df$PrimaryResearcher)) %>% unique() %>% length()
Code
Match.3A <- ca.Df %>% 
  inner_join(sw.Df, by = c("PrimaryResearcher", "Name" = "ProjectTitleEn")) %>%
  filter(!is.na(PrimaryResearcher)) %>%
  filter(!duplicated(Name))

Match.3B <- ca.Df %>% 
  inner_join(sw.Df, by = c("PrimaryResearcher", "Name" = "ProjectTitleSv")) %>%
  filter(!is.na(PrimaryResearcher)) %>%
  filter(!duplicated(Name))
Warning in inner_join(., sw.Df, by = c("PrimaryResearcher", Name = "ProjectTitleSv")): Each row in `x` is expected to match at most 1 row in `y`.
ℹ Row 1671 of `x` matches multiple rows.
ℹ If multiple matches are expected, set `multiple = "all"` to silence this
  warning.
Code
Match.3C <- bind_rows(Match.3A,Match.3B) %>% filter(!duplicated(Name))
Code
bazzi.na.Total(x = Match.3C, col = "black",typographic = T) + 
  theme(axis.text.x = element_text(size = 9,angle = 45),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9), 
        axis.title.y.right = element_text(size = 9))

Figure 12. Pareto chart for missing values (counts and proportions) in the reduced CASE dataset.

Results (3/3)

69 exact project matches based on Primary Researcher and Project Name.

Code
## Venn diagram showing projects overlap based on project title.
mP3 <- draw.pairwise.venn(area1= 3068,area2 = 746,
                          cross.area = 69,
                          category = c("SweCris: Primary Researcher",
                                       "CASE: PrimaryResearcher"),
                          fill = c("grey80","blue"),
                          cat.col = "black",cat.cex = 1,
                          scaled = T,cex = 1,
                          rotation.degree = 45)
Code
ggarrange(mP3) + theme(aspect.ratio = 1)

Figure 13. Venn diagram showing project overlap based on Primary Researcher Area 1 (Grey) = sweCris. Area 2 (Red) = Case

Code
# Combine ID, Title, and Primary Research
Total.M2 <- merge(Total.M1,Match.3C,all = T)
Total.M2$Name <- tolower(iconv(Total.M2$Name,to = "ASCII//TRANSLIT"))
Total.M2 <- Total.M2 %>% filter(!duplicated(Name)) # 203 unique projects in total.

# And if limited to Research projects only.
Total.M3 <- Total.M2 %>% filter(type == "Research project") # 174 in total.
Total.M3 %>% group_by(FundingOrganisation) %>% count(sort = T)
# A tibble: 10 × 2
# Groups:   FundingOrganisation [10]
   FundingOrganisation       n
   <chr>                 <int>
 1 VINNOVA                  51
 2 Energimyndigheten        48
 3 Vinnova                  25
 4 VR                       22
 5 Formas                   18
 6 EU                        6
 7 Forte                     1
 8 Not Applicable N/A        1
 9 Other national funder     1
10 Rymdstyrelsen             1
Code
# Recode
Total.M3$FundingOrganisation <- recode(Total.M3$FundingOrganisation,
                                       Vinnova = "VINNOVA")
Code
knitr::kable(x = Total.M3 %>%
               group_by(FundingOrganisation) %>%
               count(sort = T), caption = 
               "Table 4. Matched projects and corresponding funding organisation") %>%
  kable_styling()
Table 4. Matched projects and corresponding funding organisation
FundingOrganisation n
VINNOVA 76
Energimyndigheten 48
VR 22
Formas 18
EU 6
Forte 1
Not Applicable N/A 1
Other national funder 1
Rymdstyrelsen 1
Code
ggarrange(mP1,mP2,mP3,nrow = 1,heights = c(2,2),widths = c(4,4)) + 
  theme(aspect.ratio = 1/2)

Figure 14. Venn diagram showing project overlap based on Id, Title, and Primary Researcher.

Code
# Table preparations.
match_per_org <- Total.M3 %>% 
  group_by(FundingOrganisation) %>%
  count(sort = T) %>% 
  filter(FundingOrganisation != "Not Applicable N/A") %>%
  filter(FundingOrganisation != "Other national funder") %>%
  filter(FundingOrganisation != "EU") %>% ungroup() %>%
  add_row(FundingOrganisation = "Riksbankens Jubileumsfond",n = 0)

# Reorder
match_per_org <- match_per_org[c(4,5,7,6,2,3,1),]
# Combine.
Tab.4 <- cbind(Tab.3,match_per_org)
New names:
• `Sample` -> `Sample...2`
• `Sample` -> `Sample...4`
Code
class(Tab.4) <- "data.frame"

# Remove redundant field.
Tab.4 <- Tab.4[,-c(3,5)]
# Rename columns.
colnames(Tab.4)[1] <- "Funding Organisation"
colnames(Tab.4)[2] <- "n_Swecris"
colnames(Tab.4)[3] <- "n_Case"
colnames(Tab.4)[4] <- "n_matches"
# Reorder
Tab.4 <- Tab.4[,c(1,3,2,4)]
# Proportions.
Tab.4["%_matched"] <- round(Tab.4[,4]/Tab.4[,2] * 100,digits = 0) # no decimal
# Sum of porject matches.
sum(Tab.4$n_matches) + 8 # 174 as in Total.M3 (Research Projects)
[1] 174
Code
knitr::kable(x = Tab.4,
caption = "Table 1. Summary of Agencies Involved in Project Funding for Case and Swecris. The Swecris dataset has seven funding agencies and the Case dataset has 29. The funding organization is unknown for 577 projects in Case. A hypothetical project match of 755 is assumed.") %>%
  kable_styling() %>%
  column_spec(4, bold = T, color = "white", background = "blue") %>%
  column_spec(5, bold = T, color = "white", background = "grey") %>%
  add_footnote(label = "6 research projects funded by the EU, 1 designated as Not Applicable, and 1 other national funder were also intersected (i.e. matched)",
               notation = "symbol",threeparttable = T)
Table 1. Summary of Agencies Involved in Project Funding for Case and Swecris. The Swecris dataset has seven funding agencies and the Case dataset has 29. The funding organization is unknown for 577 projects in Case. A hypothetical project match of 755 is assumed.
Funding Organisation n_Case n_Swecris n_matches %_matched
Formas 93 407 18 19
Forte 8 13 1 12
Riksbankens Jubileumsfond 6 12 0 0
Rymdstyrelsen 7 74 1 14
Statens energimyndighet 206 371 48 23
Vetenskapsrådet 116 1559 22 19
Vinnova 330 701 76 23
* 6 research projects funded by the EU, 1 designated as Not Applicable, and 1 other national funder were also intersected (i.e. matched)

Section summary — Case vs sweCris

The main results revealed several issues with the data quality in the CASE dataset, particularly in the Project Number field which would benefit from a re-structuring and consistent ID formatting across projects to make database comparisons more efficient.

The results of the study found that the number of project matches depends on how ID standardization is done. Using the two predominant ID styles in sweCris as a reference, 139 distinct project matches were obtained. However, generally, most CASE IDs are written inconsistently, making it difficult to merge with sweCris. The issue is further compounded by variable ID formats in sweCris, although it is generally better than CASE. Out of 384 CASE projects with a clean ID, only 36% (N=139) could be matched against the sweCris dataset. Despite the low number of project matches, the added information retrieved from sweCris is generally complete (column-wise) as the data quality in sweCris tends to be better.

Results based on the project title found 140 matches, which is on par with the number of matches captured using the project ID. However, there are data-quality issues in both datasets, with three missing values in the Name/Title field in CASE and 78 project titles that are pure duplicates, 5 triplets, and four quadruplets in sweCris. Using ID vs Title identifiers returned different matching projects, with only 79 projects being the same. Jointly, however, ID and Project Title yield 203 project matches against sweCris.

The results based on the primary researcher found that the InvolvedPeople column has a complex string structure with relevant information that ought to be parsed out to facilitate maximum matching with the CASE database. The name of 2441 Principal Investigators (77.81%) could be successfully collected via sweCris and 69 exact project matches between CASE and sweCris were found. In total, 203 unique projects can be matched based on ID, Title, and Primary Researcher. This means that of the 746 potential project fits, 27% were captured, with 73% missing.

Ancillary remarks. Obviously, a primary risk is the difficulty in integrating with other systems. (e.g. sweCris). Data quality is also a concern as Case have different levels of data quality. Additionally, if the databases are not representative of the population (e.g. ITM) being studied, it can lead to biased conclusions from the analysis. Lastly, if the databases contain duplicates, it can lead to confusion and inaccuracies in the analysis.


4. External Key: ORCID ID (KTH’s user database)

Code
# ugkthId & Orcid.
ug <- readr::read_csv(rawToChar(kthcorpus::minio_get("ug_kthid_orcid.csv",
                                                     bucket = "kthcorpus")))
# unique ugkthId.
lookup <- readr::read_csv(rawToChar(kthcorpus::minio_get("ug_kthid_unit.csv",
                                                         bucket = "kthcorpus")))
Code
# How many Primary Researchers in CASE dont have an CORCID ID?
# N = 131.
ca.Df %>%
  filter(type == "Research project") %>%
  group_by(username) %>% count() %>% arrange(desc(n)) %>%
  left_join(lookup, by = c("username" = "ugUsername")) %>%
  dplyr::select(username, ugOrcid, ugKthid, n) %>%
  filter(is.na(ugOrcid))
# A tibble: 133 × 4
# Groups:   username [133]
   username ugOrcid ugKthid     n
   <chr>    <chr>   <chr>   <int>
 1 regm     <NA>    <NA>       19
 2 larsges  <NA>    <NA>       11
 3 jeanna   <NA>    <NA>        7
 4 xiaoqli  <NA>    <NA>        7
 5 bergsman <NA>    <NA>        6
 6 glodic   <NA>    <NA>        6
 7 aman     <NA>    <NA>        5
 8 carmim   <NA>    <NA>        5
 9 maranti  <NA>    <NA>        5
10 samer    <NA>    <NA>        5
# … with 123 more rows
Code
# Merge by username and remove incomplete cases (subset by ugOrcid)
# N = 1380 research projects.
exDf.ca <- ca.Df %>%
  filter(type == "Research project") %>%
  left_join(lookup, by = c("username" = "ugUsername")) %>%
  filter(!is.na(ugOrcid)) %>%
  filter(!duplicated(Name))

sapply(exDf.ca[c("ProjectNumber","Name")],function(x) duplicated(x))

# Unique ORCID IDs (N = 470)
distinct.orcidID <- unique(exDf.ca$ugOrcid)
# Plot the data information.
avail_orcid <- data.frame(x = c("With an orcid number","Without an orcid number"),
                          y = c(1380,131))
avail_orcid %>% 
  ggplot(mapping = aes(x = x, y = y)) +
  geom_col() + labs(title = "Research projects in Case") + theme(aspect.ratio = 1.5)

Call sweCris api to get project results

Code
# Initialize empty list to store the results
res <- list()
# Iterate over each ORCID ID
for (i in 1:470){
    orcid <- distinct.orcidID[i]
    res[[i]] <- tryCatch(swecris::swecris_projects_from_orcid(orcid),
                         error=function(e) e)
}

# Flatten.
op <- unlist(res,recursive = FALSE)
# Projects.
op_projects <- op[sapply(op, function(x) any(grepl("project", names(x))))]
projects <- do.call(rbind, op_projects)
projects <- projects %>% distinct()
# People list.
op_peopleList <- op[sapply(op, function(x) any(grepl("fullName", names(x))))]
peopleList  <- do.call(rbind,op_peopleList)
peopleList %>% group_by(fullName,roleEn) %>% count()
# Scbs.
op_scbs <- op[sapply(op, function(x) any(grepl("scb1NameEn", names(x))))]
scbs <- do.call(rbind, op_scbs)

A total of 986 projects were retrieved by using orchid-ID’s associated with primary researchers at KTH. The question that arises is to what extent these projects align with those in CASE.

Code
# Match clean CASE data versus Orcid-data based on Project ID.
Match.4 <- cleanID.CASE %>%
  dplyr::inner_join(projects,by = c(clean.IDA = "projectId"),keep = TRUE)
Code
# Clean title names first a bit.
ca.Df <- ca.Df %>%
  mutate(clean_name =  gsub("\\s+"," ", iconv(tolower(ca.Df$Name),
                                              to = "ASCII//TRANSLIT")))

projects <- projects %>%
  mutate(clean_name_swe =  gsub("\\s+"," ", iconv(tolower(projects$projectTitleSv),
                                                  to = "ASCII//TRANSLIT"))) %>%
  mutate(clean_name_eng =  gsub("\\s+"," ", iconv(tolower(projects$projectTitleEn),
                                                  to = "ASCII//TRANSLIT"))) 
# Match.
Match.5A <- inner_join(ca.Df, projects, by = c("clean_name" = "clean_name_eng"))
Match.5B <- inner_join(ca.Df, projects, by = c("clean_name" = "clean_name_swe"))
Match.5C <- dplyr::bind_rows(Match.5A,Match.5B) # 72 project matches.
Code
# CASE copy
case.orcid <- ca.Df
# Fussy attempt on title.
Match.5D <- stringdist_inner_join(x = case.orcid[!is.na(case.orcid$Name),],
                                  y = projects, 
                                  by = c("clean_name" = "clean_name_eng"))

Match.5E <- stringdist_inner_join(x = case.orcid[!is.na(case.orcid$Name),],
                                  y = projects,
                                  by = c("clean_name" = "clean_name_swe"))

Match.5F <- bind_rows(Match.5D,Match.5E)

# Plot matching results.
Orcid.Match.Res <- data.frame(ID = c("48","72","91"),
                              class = c("Exact match: Case vs Orcid ID",
                                        "Exact match: Case vs Orcid Title",
                                        "Fuzzy match: Case vs Orcid Title"))

Section summary — Case vs Orcid

Out of the primary researchers in Case, 131 do not have an ORCID ID. In total, there are 470 distinct ORCID ID’s associated with research projects in Case. By using these ORCID ID’s to retrieve data through sweCris, 986 projects were obtained. To determine if there is overlap between these projects and those in Case, I compared the exact ID’s and project titles. The results indicate that there is very little overlap between the projects in Case and those found in the sweCris database. It’s unclear why these data source are so seemingly difficult to compare.

Results: Project overlap based on ID between the original Case data and sweCris downloaded data using orcid-id’s of Case listed primary researchers and here informed via the ug-file, only return an exact match of 48 projects. Conversely, a total of 72 exact matches could be returned or found using the title field instead. A fuzzy match based on project titles returned instead 91 observations.


CORDIS

CORDIS is the primary source of results from EU-funded projects.

Code
# cordis::cordis_import(refresh = T)
cordis::cordis_schema()
# A tibble: 421 × 7
   tablename                  cid name             type    notnull dflt_…¹ pk   
   <chr>                    <int> <chr>            <chr>   <lgl>   <chr>   <lgl>
 1 fp7_dm_proj_publications     0 PROJECT_ID       DOUBLE  FALSE   <NA>    FALSE
 2 fp7_dm_proj_publications     1 TITLE            VARCHAR FALSE   <NA>    FALSE
 3 fp7_dm_proj_publications     2 AUTHOR           VARCHAR FALSE   <NA>    FALSE
 4 fp7_dm_proj_publications     3 DOI              VARCHAR FALSE   <NA>    FALSE
 5 fp7_dm_proj_publications     4 PUBLICATION_TYPE VARCHAR FALSE   <NA>    FALSE
 6 fp7_dm_proj_publications     5 REPOSITORY_URL   VARCHAR FALSE   <NA>    FALSE
 7 fp7_dm_proj_publications     6 JOURNAL_TITLE    VARCHAR FALSE   <NA>    FALSE
 8 fp7_dm_proj_publications     7 PUBLISHER        VARCHAR FALSE   <NA>    FALSE
 9 fp7_dm_proj_publications     8 VOLUME           VARCHAR FALSE   <NA>    FALSE
10 fp7_dm_proj_publications     9 PAGES            VARCHAR FALSE   <NA>    FALSE
# … with 411 more rows, and abbreviated variable name ¹​dflt_value
Code
cordis::cordis_tables() %>% print(n = 41)
# A tibble: 41 × 2
   table                              n_row
   <chr>                              <dbl>
 1 fp7_dm_proj_publications          305549
 2 fp7_euroSciVoc                     68651
 3 fp7_legalBasis                     25785
 4 fp7_organization                  140055
 5 fp7_project                        25785
 6 fp7_projectirps                     5293
 7 fp7_reportSummaries                21606
 8 fp7_topics                         26153
 9 fp7_webItem                        11767
10 fp7_webLink                         7989
11 h2020_euroSciVoc                  122551
12 h2020_legalBasis                   65784
13 h2020_organization                177078
14 h2020_pi                            8043
15 h2020_project                      35382
16 h2020_projectDeliverables         137319
17 h2020_projectIrps                   2324
18 h2020_projectPublications         318740
19 h2020_reportSummaries              27082
20 h2020_scoreboard                 1048576
21 h2020_topics                       35382
22 h2020_webItem                          9
23 h2020_webLink                     162178
24 he_euroSciVoc                      15810
25 he_legalBasis                       7657
26 he_organization                    36680
27 he_project                          5250
28 he_projectDeliverables               205
29 he_topics                           5250
30 he_webLink                           241
31 ref_countries                       1503
32 ref_fp6programmes                   2027
33 ref_fp7programmes                   6233
34 ref_fp7subprogrammes                6096
35 ref_h2020programmes                 3905
36 ref_h2020topicKeywords              2562
37 ref_h2020topics                     3905
38 ref_horizonprogrammes                123
39 ref_horizontopics                   2047
40 ref_organizationactivitytype           5
41 ref_projectfundingschemecategory     187
Code
# Connection.
con <- cordis_con()
Code
he_project_ids <- con |> tbl("he_organization") |>
  filter(shortName == "KTH") |> collect() |>
  distinct(projectID, projectAcronym) |>
  pull(projectID)

# HORIZON EUROPE projects
he_projects <-
  con |> tbl("he_project") |> 
  filter(id %in% he_project_ids) |> collect()

fp7_project_ids <-
  con |> tbl("fp7_organization") |>
  filter(shortName == "KTH") |> collect() |>
  distinct(projectID, projectAcronym) |>
  pull(projectID)

# EU research projects under FP7
fp7_projects <- con |> tbl("fp7_project") |> 
  filter(id %in% fp7_project_ids) |> collect() |>
  mutate(totalCost = readr::parse_number(totalCost))

# Horizon 2020 projects
h2020_project_ids <-
  con |> tbl("h2020_organization") |>
  filter(shortName == "KTH") |> collect() |>
  distinct(projectID, projectAcronym) |>
  pull(projectID)

h2020_projects <- con |> tbl("h2020_project") |> 
  filter(id %in% h2020_project_ids) |> collect()

# Combine Horizon Europe, H2020 and FP7 projects into one table
kth_cordis <-
  list(h2020_projects, he_projects, fp7_projects) |> bind_rows() # 753 KTH projects.
Code
# EuroSciVoc taxonomy.
he_taxonomy <- con %>% tbl("he_euroSciVoc") %>%
  as.data.frame() %>% 
  filter(projectID %in% kth_cordis$id) |> collect()

fp7_taxonomy <- con %>% tbl("fp7_euroSciVoc") %>%
  as.data.frame() %>%
  filter(projectID %in% kth_cordis$id) |> collect()

h2020_taxonomy <- con %>% tbl("h2020_euroSciVoc") %>%
  as.data.frame() %>% 
  filter(projectID %in% kth_cordis$id) |> collect()

# Joint list
taxonomy_df <- list(he_taxonomy,fp7_taxonomy,h2020_taxonomy) %>%
  bind_rows()
Code
# Plot funding schemes in cordis.
cordis_funding_org <- kth_cordis %>% 
  group_by(fundingScheme) %>% 
  count() %>%
  ggplot(mapping = aes(x = reorder(fundingScheme, -n),y = n)) +
  labs(title = "Funding Scheme", x = "") + coord_flip() +
  geom_col() + theme(axis.text.x = element_text(angle = 45)) +
  theme(axis.text.y = element_text(size = 3.5))

# Plot completness in cordis.
cordis_quality <- bazzi.na.Total(kth_cordis)

# Funding size plot
range(kth_cordis$totalCost) # there are some format issue here.
[1]           0 17236976762
Code
# Plot subject areas
cordis_master <- kth_cordis %>% 
  mutate(id_clean = as.numeric(id)) %>%
  inner_join(taxonomy_df[!duplicated(taxonomy_df$projectID),],
             by = c("id_clean" = "projectID"))

cordis_subjects <- cordis_master %>% 
  group_by(euroSciVocTitle) %>%
  count(sort = T) %>%
  filter(n > 5) %>%
  ggplot(mapping = aes(x = euroSciVocTitle, y = n)) +
  geom_col() + theme(axis.text.x = element_text(size = 4),
                     axis.text.y = element_text(size = 4)) + coord_flip() + 
  labs(title = "Subject categories: n > 5", x = "")

# Disconnect.
# on.exit(cordis_disconnect(con))

Cordis vs Case intersection

Code
# Title match? 231 exact matches.
cordis_match_a <- kth_cordis %>%
  dplyr::inner_join(ca.Df, 
                    by = c("title" = "Name"),
                    multiple = "all") %>%
  filter(!duplicated(title))

# In terms of project matches, what are the funding calls?
cordis_match_funding <- cordis_match_a %>% 
  group_by(fundingScheme) %>%
  count(sort = T) %>% ggplot(mapping = aes(x = reorder(fundingScheme,-n), y = n)) +
  geom_col() + coord_flip() + 
  labs(title = "Cordis vs. Case project matches: Funding Schemes")

# Fuzzy matching? 256 project matches.
cordis_match_b <- 
  ca.Df[!is.na(ca.Df$Name),] %>%
  stringdist_inner_join(kth_cordis, by = c("Name" = "title"), max_dist = 1) %>%
  filter(!duplicated(title))
Code
# Veen diagram of the above results.
mP4 <- draw.pairwise.venn(area1 = 2611,area2 = 753,
                          cross.area = 231,
                          category = c("Cordis: Title",
                                       "Case: Title"),
                          fill = c("grey80","blue"),
                          cat.col = "black",cat.cex = 1,
                          scaled = T,cex = 1,
                          rotation.degree = 45)

Cordis vs sweCris

Code
# sweCris versus Cordis using title. Only 6 projects.
cordis_match_c <- sw.Df %>%
  inner_join(kth_cordis, by = c("ProjectTitleEn" = "title"))

Section summary — Cordis

OpenAIRE

Code
open_aire_kth_xml <- openaire_crawl("projects", page_size = 100, params = api_params(
  format = "xml", 
  proj_country = "SE",
  proj_org = "Royal Institute of Technology"))

open_aire_kth_tsv <- openaire_crawl("projects", page_size = 100, params = api_params(
  format = "tsv", 
  proj_country = "SE",
  proj_org = "Royal Institute of Technology"))
Code
# Generate a handful of graphs reflecting openAIRE data content
openAire_quality <- bazzi.na.Total(open_aire_kth_xml)
# Plot funding details.
openAire_funding_org <- open_aire_kth_xml %>% 
  group_by(funder_name) %>%
  count() %>%
  ggplot(mapping = aes(x = reorder(funder_name, -n),y = n)) + 
  labs(title = "openAire",x = "") + coord_flip() +
  geom_col() + theme(axis.text.x = element_text(angle = 45))

# OA is mandated
openAire_OA_A <- open_aire_kth_xml %>%
  group_by(oa_is_mandated) %>% count(sort = T) %>%
  ggplot(aes(x = oa_is_mandated, y = n, fill = oa_is_mandated)) + 
  geom_col() + labs(x = "", title = "Funding requirement")

# Add new variable.
open_aire_kth_xml["startYear"] <- 
  format(as.Date(open_aire_kth_xml$beg_date),"%Y")

openAire_OA_B <-
  open_aire_kth_xml %>% group_by(oa_is_mandated,startYear) %>% count(sort = T) %>%
  ggplot(aes(x = startYear, y = n, fill = oa_is_mandated)) + 
  geom_bar(stat = "identity") + 
  theme(axis.text.x = element_text(angle = 45,size = 6)) +
  labs(x = "", title = "OA mandated projects over time")

openAire versus Case

Code
# Project match on title.
Match.7A <- ca.Df %>% 
  inner_join(open_aire_kth_tsv, by = c("Name" = "Project title"), keep = T)

# ... 225 project matches.
Match.7B <- ca.Df %>% 
  inner_join(open_aire_kth_xml, by = c("Name" = "title"), keep = T)

openAire versus Swecris

Code
# Match against sweCris
Match.7C <- sw.Df %>% 
  inner_join(open_aire_kth_xml, by = c("ProjectTitleEn" = "title"), keep = T) # 6 only

openAire versus Cordis (Title)

Code
# 1. Cordis verus openAire using title. 
cordis_vs_openAire_a <- kth_cordis %>% 
  inner_join(open_aire_kth_xml, by = "title", multiple = "all") %>% 
  filter(!duplicated(title)) # 664 KTH projects.

# 2. Fuzzy matching
cordis_vs_openAire_b <- kth_cordis %>% 
  stringdist_inner_join(open_aire_kth_xml, by = "title") # 701 projects.

openAire versus Cordis (ID)

Code
kth_cordis$id <- as.character(kth_cordis$id)
# Match
# cordis_vs_openAire_c <- open_aire_kth_xml %>% 
#  inner_join(kth_cordis, by = c("clean.ID" = "id")) # 385

Combine all data-sources

Code
df_list <- list(Total.M3,cordis_match_a,Match.7B,cordis_vs_openAire_a)

joined_df <- Reduce(function(x, y) merge(x, y, all=TRUE), df_list) %>%
  filter(!duplicated(title))