# scQUEST/data-loader-aml-annotated-cell-types.R (AI4SCR/scQUEST, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# load packages
pkgs = c('flowCore',
         'anndata',
         'tidyverse',
         'magrittr',
         'httr')
for(i in pkgs) suppressPackageStartupMessages(library(i, character.only = TRUE))

# define the python interpreter to use
# The interpreter path below is specific to one machine. It is only pinned when
# it actually exists; otherwise reticulate is left to locate an interpreter
# through its own default discovery.
python.path = '/usr/local/Caskroom/miniconda/base/envs/r-reticulate-env/bin/python'
if(file.exists(python.path)){
  reticulate::use_python(python.path)
} else {
  message('python interpreter not found at ', python.path,
          '; falling back to the default reticulate discovery')
}

# request for figshare api
project_id = 140062 # not referenced anywhere else in this script
articel_id = 19867114 # annotated, 19867021 healthy vs AML
# directory the downloaded files and the resulting .h5ad file are stored in
PATH = '~/.scQUEST/Levine_CyTOF_AML_AnnotatedCellTypes/'
# endpoint listing the files attached to the article
REQUEST = paste0('https://api.figshare.com/v2/articles/', articel_id, '/files')

# helpers

# Build per-column lookup tables from a key column.
#
# key: name (single string) of the column in `df` whose values become the
#      lookup names.
# df:  data frame or tibble holding the key column and the value columns.
#
# Returns a named list with one entry per non-key column. Each entry is a list
# of that column's values, named by the corresponding key values, so that
# mapping[[column]][[key_value]] yields a single value.
# Stops with an error if `key` is not a single string naming a column of `df`.
mapper = function(key, df){
  if(!is.character(key) || length(key) != 1){
    stop("'key' must be a single column name", call. = FALSE)
  }
  # without this check a missing key column yields NULL names, i.e. lookups
  # that silently cannot be addressed by key
  if(!key %in% colnames(df)){
    stop("key column '", key, "' not found in data frame", call. = FALSE)
  }

  keys = df[[key]]
  cols = colnames(df)
  cols = cols[cols != key]

  mapping = list()
  for(col in cols){
    lookup = as.list(df[[col]])
    names(lookup) = keys
    mapping[[col]] = lookup
  }
  mapping
}

# Download every file listed by a figshare file-listing request.
#
# path:    target directory; created recursively if it does not exist.
# request: URL of the API endpoint that lists the files to download.
#
# Files that already exist locally are not downloaded again.
# Returns a list with one character vector c(download_url, local_path) per
# listed file. Stops if the listing request or any download fails.
download_files = function(path = PATH, request = REQUEST){
  path = path.expand(path)
  # local paths are built by concatenation, so guarantee a trailing separator
  if(nzchar(path) && !grepl('[/\\\\]$', path)) path = paste0(path, '/')
  # only create the directory when needed to avoid a warning on every re-run
  if(!dir.exists(path)){
    dir.create(path, showWarnings = TRUE, recursive = TRUE, mode = "0777")
  }

  resp = GET(request)
  stop_for_status(resp)

  # NOTE(review): if the API paginates this listing, only the first page is
  # fetched here - confirm against the figshare API documentation.
  files = content(resp, 'parsed')
  lapply(files, function(x){
    url = x$download_url
    fpath = paste0(path, x$name)
    if(!file.exists(fpath)){
      # 'wb' keeps binary files (e.g. FCS) intact on Windows. A partially
      # written file is removed on failure so that the next run retries the
      # download instead of skipping it.
      status = tryCatch(
        download.file(url, fpath, mode = 'wb'),
        error = function(e){
          unlink(fpath)
          stop(e)
        }
      )
      if(!identical(as.integer(status), 0L)){
        unlink(fpath)
        stop('download failed for ', url, call. = FALSE)
      }
    }
    c(url, fpath)
  })
}

# download files
files = download_files(PATH, REQUEST)
# keep only the local path (second element) of each c(url, path) pair
files = unlist(map(files, ~.x[2]))
# the dot is escaped so that only real '.fcs' / '.csv' extensions match
files.fcs = files[grepl('\\.fcs$', files)]
files.csv = files[grepl('\\.csv$', files)]

files.sampleID = files.csv[grepl('sampleID', files.csv)]
files.channels = files.csv[grepl('channels', files.csv)]

# fail early with a readable message instead of a confusing error further down
stopifnot(
  'no FCS files found' = length(files.fcs) > 0,
  'expected exactly one sampleID csv file' = length(files.sampleID) == 1,
  'expected exactly one channels csv file' = length(files.channels) == 1
)

# mappings
# per-column lookups keyed by the 'sample_id' column of the sampleID file
rename.sample = readr::read_csv(files.sampleID)
map.sample = mapper('sample_id', rename.sample)

# per-column lookups keyed by the 'channel' column of the channels file
rename.channel = readr::read_csv(files.channels)
map.channel = mapper('channel', rename.channel)

# load FCS file
# Per-file results are collected in lists and combined once after the loop;
# growing X / OBS with rbind() in every iteration would copy all previously
# read rows again for each additional file.
X.parts = vector('list', length(files.fcs))
OBS.parts = vector('list', length(files.fcs))
VAR = NULL
fcs.header = list()

f.count = 0
for(f in files.fcs){
  f.count = f.count + 1
  f.base =  basename(f)
  cat(f.count, '/', length(files.fcs), ' reading in file ', f.base, '\n')

  # headers are stored under the base name, so that is the name to check in
  # order not to overwrite an earlier entry
  stopifnot(!f.base %in% names(fcs.header))

  header = read.FCSheader(f)
  header.tbl = tibble(keyword = names(header[[1]]), value=header[[1]])

  fcs <- read.FCS(f)
  x = exprs(fcs)

  # one-row annotation for this file, looked up by file name in map.sample
  obs = list(fcs_file = f.base)
  for(i in names(map.sample)){
    if(!f.base %in% names(map.sample[[i]])){
      stop('no sample annotation found for file ', f.base, call. = FALSE)
    }
    obs[[i]] = map.sample[[i]][[f.base]]
  }

  # repeat the annotation row once per row of the expression matrix
  obs = do.call(tibble, obs)
  obs %<>% slice(rep(1, each = dim(x)[1]))

  var = parameters(fcs)@data %>% select(-range, -minRange, -maxRange)

  if(is.null(VAR)){
    VAR = var
  } else {
    # all files must share the same parameter table. The comparison is
    # NA-aware: `==` yields NA where both tables hold NA, which would make
    # all() NA and trip stopifnot() although the tables agree.
    stopifnot(identical(dim(VAR), dim(var)))
    same = as.matrix(VAR == var)
    same[is.na(VAR) & is.na(var)] = TRUE
    stopifnot(!anyNA(same), all(same))
  }

  X.parts[[f.count]] = x
  OBS.parts[[f.count]] = obs
  fcs.header[[f.base]] = header.tbl
}

# combine the per-file pieces in file order
X = do.call(rbind, X.parts)
OBS = do.call(rbind, OBS.parts)

# tidy data
VAR %<>% rename(channel = name, marker = desc)
# add one column per entry of map.channel, looked up by channel name
for(i in names(map.channel)){
  # a channel without an entry indexes to NULL, which unlist() drops; the
  # remaining values would then no longer line up with the rows of VAR
  missing.channels = setdiff(VAR$channel, names(map.channel[[i]]))
  if(length(missing.channels) > 0){
    stop('no "', i, '" annotation for channel(s): ',
         paste(missing.channels, collapse = ', '), call. = FALSE)
  }
  VAR %<>% mutate("{i}" := unlist(map.channel[[i]][channel]))
}

# create AnnData object
# X: combined expression matrix, obs: per-row file/sample annotation,
# var: channel annotation, uns$fcs_header: header keywords per FCS file
ad = AnnData(
  X = X,
  var = VAR,
  obs = OBS,
  uns = list(
    fcs_header = fcs.header
  )
)

# write the object next to the downloaded files
path.output = path.expand(paste0(PATH, 'ad_annotated_cell_types.h5ad'))
anndata::write_h5ad(ad, path.output)