Data from the Research Organization Registry Community • institutions

library(institutions)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# NOTE(review): presumably fetches the data that institutions_search() queries
# further down — confirm against the package documentation
institutions_download()

A static snapshot (October 2020) of the dataset published by the Research Organization Registry (ROR) Community is bundled with this R package.

It includes GRID identifiers for research organizations, so it can be linked to the data that the other functions in this R package provide.

Some examples of usage follow:


# look up the GRID identifier of one specific institution
id_kth_grid <- institutions_search("Royal Institute of Technology")$grid_id

# translate that GRID identifier into the matching ROR identifier
id_kth_ror <- pull(filter(ror$ror_orgs, GRID == id_kth_grid), id)

# show every core ROR field stored for this identifier
ror$ror_orgs %>%
  filter(id == id_kth_ror) %>%
  glimpse()
#> Rows: 1
#> Columns: 12
#> $ id            <chr> "https://ror.org/026vcq606"
#> $ name          <chr> "KTH Royal Institute of Technology"
#> $ email_address <lgl> NA
#> $ established   <int> 1827
#> $ status        <chr> "active"
#> $ wikipedia_url <chr> "https://en.m.wikipedia.org/wiki/KTH_Royal_Institute_of_…
#> $ country_name  <chr> "Sweden"
#> $ country_code  <chr> "SE"
#> $ FundRef       <chr> NA
#> $ GRID          <chr> "grid.5037.1"
#> $ ISNI          <chr> NA
#> $ Wikidata      <chr> NA

# list the other identifiers registered for this org
filter(ror$ror_ids, id == id_kth_ror)
#> # A tibble: 4 × 3
#>   id                        name     value              
#>   <chr>                     <chr>    <chr>              
#> 1 https://ror.org/026vcq606 FundRef  501100004270       
#> 2 https://ror.org/026vcq606 GRID     grid.5037.1        
#> 3 https://ror.org/026vcq606 ISNI     0000 0001 2158 1746
#> 4 https://ror.org/026vcq606 Wikidata Q854280

# ROR child tables: entries for labels, aliases, acronyms, links and types

# aliases recorded for this org
filter(ror$ror_aliases, id == id_kth_ror)
#> # A tibble: 1 × 2
#>   id                        aliases                      
#>   <chr>                     <chr>                        
#> 1 https://ror.org/026vcq606 Royal Institute of Technology

# acronyms recorded for this org
ror$ror_acronyms %>%
  filter(id == id_kth_ror)
#> # A tibble: 1 × 2
#>   id                        acronyms
#>   <chr>                     <chr>   
#> 1 https://ror.org/026vcq606 KTH

# links recorded for this org
ror$ror_links %>%
  filter(id == id_kth_ror)
#> # A tibble: 1 × 2
#>   id                        links             
#>   <chr>                     <chr>             
#> 1 https://ror.org/026vcq606 https://www.kth.se

# types recorded for this org
ror$ror_types %>%
  filter(id == id_kth_ror)
#> # A tibble: 2 × 2
#>   id                        types    
#>   <chr>                     <chr>    
#> 1 https://ror.org/026vcq606 Education
#> 2 https://ror.org/026vcq606 Funder

Some summaries

The full dataset is available and can be summarized.

Using “child tables” for summaries

A few examples:


# the five most common types of orgs in this data
ror$ror_types %>%
  count(types, sort = TRUE) %>%
  head(5)
#> # A tibble: 5 × 2
#>   types          n
#>   <chr>      <int>
#> 1 Company    30784
#> 2 Education  22186
#> 3 Funder     17122
#> 4 Nonprofit  15068
#> 5 Healthcare 13933

# top 5 alias strings that occur most often across the alias table
# (n is the number of rows in ror_aliases carrying that alias, so a high n
# means the same alias is attached to many records, not that one org has
# many aliases)
ror$ror_aliases %>%
  count(aliases, sort = TRUE) %>%
  head(5)
#> # A tibble: 5 × 2
#>   aliases                                         n
#>   <chr>                                       <int>
#> 1 Merck Sharp & Dohme                            15
#> 2 Hoffmann-La Roche                              13
#> 3 ASEA Brown Boveri                              11
#> 4 E. I. du Pont de Nemours and Company           11
#> 5 International Business Machines Corporation    11

# why is the IBM alias attached to so many records? seemingly because it is a
# multi-national corp. whose national branches each have their own entry
# the join key is spelled out so inner_join() does not have to guess it
ror$ror_aliases %>%
  filter(aliases == "International Business Machines Corporation") %>%
  inner_join(ror$ror_orgs, by = "id") %>%
  pull(name) %>%
  paste0(collapse = ", ")
#> [1] "IBM (France), IBM (Belgium), IBM (India), IBM (Egypt), IBM (Denmark), IBM (Portugal), IBM (Czechia), IBM (Ireland), IBM (Brazil), IBM (Slovakia), IBM (Germany)"

# which acronym is used most often, and (further down) which orgs abbreviate
# their names to it?
ror$ror_acronyms %>%
  count(acronyms, sort = TRUE)
#> # A tibble: 33,870 × 2
#>    acronyms     n
#>    <chr>    <int>
#>  1 CCC         44
#>  2 MU          33
#>  3 AU          32
#>  4 SCC         31
#>  5 HCC         30
#>  6 MCC         30
#>  7 CRC         29
#>  8 CMC         27
#>  9 MSU         27
#> 10 ARC         26
#> # ℹ 33,860 more rows
# names of all orgs that use the acronym "CCC"; the join key is spelled out so
# inner_join() does not have to guess it
ror$ror_acronyms %>%
  filter(acronyms == "CCC") %>%
  inner_join(ror$ror_orgs, by = "id") %>%
  pull(name) %>%
  paste0(collapse = ", ")
#> [1] "Clatterbridge Cancer Centre NHS Foundation Trust, Committee on Climate Change, Camden County College, Capital Consulting Corporation (United States), IEA Clean Coal Centre, Colby Community College, Crile Carvey Consulting, Children's Cancer Center, Community Cancer Center, Capital Community College, Clatsop Community College, Central City Concern, Cleveland Community College, California Coastal Commission, California Conservation Corps, Chinese Culture Center of San Francisco, Clearwater Christian College, Clovis Community College, Coahoma Community College, Cumberland County College, Carolina Christian College, Consolidated Contractors Company (Greece), Community Care College, California Christian College, Craven Community College, Coffeyville Community College, Carteret Community College, Center for Community Change, Carbohydrate Competence Center, Cancer Research UK Cambridge Center, Sport Wales, Crichton Carbon Centre, Canola Council of Canada, Bowel Cancer UK, Cameron (United States), Cameron (United Kingdom), Childhood Cancer Canada Foundation, Comprehensive Cancer Center Vienna, City College of Calamba, Clinton Community College, SUNY Corning Community College, Clackamas Community College, Centro de Química de Coimbra, IUCN Climate Crisis Commission"

# does any org list the very same link more than once, i.e. are there
# duplicated (id, links) rows? an empty result means there are none
ror$ror_links %>%
  count(id, links) %>%
  filter(n > 1)
#> # A tibble: 0 × 3
#> # ℹ 3 variables: id <chr>, links <chr>, n <int>
# spot-check the links stored for one particular org
ror$ror_links %>%
  filter(id == "https://ror.org/01eea1w69")
#> # A tibble: 1 × 2
#>   id                        links                          
#>   <chr>                     <chr>                          
#> 1 https://ror.org/01eea1w69 https://www.southernwater.co.uk

External identifiers


extids <- ror$ror_ids

# which external identifiers are most frequently provided?
# one given identifier can have multiple other identifiers, esp. FundRef
# count() with sort = TRUE returns an ungrouped tibble, largest n first, so no
# grouping is left behind on the result
extids %>%
  count(name, sort = TRUE)
#> # A tibble: 4 × 2
#>   name          n
#>   <chr>     <int>
#> 1 GRID     113119
#> 2 ISNI      55891
#> 3 Wikidata  53254
#> 4 FundRef   20466

# collect the distinct Wikidata identifiers registered for a set of orgs
ids_wikidata <- extids %>%
  filter(
    name == "Wikidata",
    id %in% c("https://ror.org/052gg0110", "https://ror.org/013meh722")
  ) %>%
  distinct() %>%
  pull(value)

# render the identifiers as a one-column table of links to wikidata.org
knitr::kable(
  tibble(
    url = sprintf(
      "<a href='https://www.wikidata.org/wiki/%s'>%s</a>",
      ids_wikidata, ids_wikidata
    )
  ),
  escape = FALSE
)

url
Q35794
Q181892
Q24679079
Q10899168
Q34433
Q5260389
Q56612600
Q7529574
Q6786826
Q1095537