Skip to content

Commit 07a327c

Browse files
authored
Merge pull request #2228 from broadinstitute/jb-de-cluster-name-ref
Make updating existing DE result more robust (SCP-5952)
2 parents b332adb + 920fedc commit 07a327c

8 files changed

+144
-24
lines changed

app/lib/differential_expression_service.rb

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_
134134
metadata_url = study_file.is_viz_anndata? ?
135135
RequestUtils.data_fragment_url(study_file, 'metadata') :
136136
study.metadata_file.gs_url
137+
cluster_group_id = cluster_group.id
137138
# begin assembling parameters
138139
de_params = {
139140
annotation_name:,
@@ -143,10 +144,12 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_
143144
group2:,
144145
annotation_file: annotation_scope == 'cluster' ? cluster_url : metadata_url,
145146
cluster_file: cluster_url,
146-
cluster_name: cluster_group.name
147+
cluster_name: set_cluster_name(study, cluster_group, annotation_name, annotation_scope),
148+
cluster_group_id:
147149
}
148150
raw_matrix = ClusterVizService.raw_matrix_for_cluster_cells(study, cluster_group)
149151
de_params[:matrix_file_path] = raw_matrix.gs_url
152+
de_params[:matrix_file_id] = raw_matrix.id
150153
if raw_matrix.file_type == 'MM Coordinate Matrix'
151154
de_params[:matrix_file_type] = 'mtx'
152155
# we know bundle exists and is completed as :raw_matrix_for_cluster_cells will throw an exception if it isn't
@@ -287,6 +290,21 @@ def self.annotation_eligible?(name)
287290
ALLOWED_ANNOTS =~ name && EXCLUDED_ANNOTS !~ name
288291
end
289292

293+
# determine if the requested cluster/annotation has an existing result object
294+
# used for setting cluster name or in pairwise DE when merging in new results
295+
#
296+
# * *params*
297+
# - +study+ (Study) => Study in which results exist
298+
# - +cluster_group+ (ClusterGroup) => Clustering object to source name/file from
299+
# - +annotation_name+ (String) => Name of requested annotation
300+
# - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster')
301+
#
302+
# * *returns*
303+
# - (DifferentialExpressionResult, nil)
304+
def self.find_existing_result(study, cluster_group, annotation_name, annotation_scope)
305+
DifferentialExpressionResult.find_by(study:, cluster_group:, annotation_name:, annotation_scope:)
306+
end
307+
290308
# determine if a study already has DE results for an annotation, taking scope into account
291309
# cluster-based annotations must match to the specified cluster in the annotation object
292310
# for study-wide annotations, return true if any results exist, regardless of cluster as this indicates that DE
@@ -309,6 +327,21 @@ def self.results_exist?(study, annotation)
309327
).exists?
310328
end
311329

330+
# helper to set cluster_name when existing results are present
331+
# this prevents result file URLs breaking because the cluster has been renamed at some point
332+
#
333+
# * *params*
334+
# - +study+ (Study) => Study in which results exist
335+
# - +cluster_group+ (ClusterGroup) => Clustering object to source name/file from
336+
# - +annotation_name+ (String) => Name of requested annotation
337+
# - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster')
338+
#
339+
# * *returns*
340+
# - (String)
341+
def self.set_cluster_name(study, cluster_group, annotation_name, annotation_scope)
342+
find_existing_result(study, cluster_group, annotation_name, annotation_scope)&.cluster_name || cluster_group.name
343+
end
344+
312345
# determine if a study meets the requirements for differential expression:
313346
# 1. public
314347
# 2. has clustering/metadata
@@ -383,12 +416,15 @@ def self.validate_annotation(cluster_group, study, annotation_name, annotation_s
383416
elsif pairwise
384417
missing = [group1, group2] - annotation[:values]
385418
raise ArgumentError, "#{annotation_name} does not contain '#{missing.join(', ')}'" if missing.any?
419+
386420
cell_count = {
387-
"#{group1}" => cells_by_label[group1].count,
388-
"#{group2}" => cells_by_label[group2].count
421+
group1.to_s => cells_by_label[group1]&.count.to_i,
422+
group2.to_s => cells_by_label[group2]&.count.to_i
389423
}.keep_if { |_, c| c < 2 }
390-
raise ArgumentError,
391-
"#{cell_count.keys.join(', ')} does not have enough cells represented in #{identifier}" if cell_count.any?
424+
if cell_count.any?
425+
raise ArgumentError, "#{cell_count.keys.join(', ')} does not have enough cells represented in #{identifier} " \
426+
"for #{cluster_group.name}"
427+
end
392428
end
393429
end
394430

app/models/differential_expression_parameters.rb

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ class DifferentialExpressionParameters
1717
# annotation_scope: scope of annotation (study, cluster)
1818
# annotation_file: source file for above annotation
1919
# cluster_file: clustering file with cells to use as control list for DE
20-
# cluster_name: name of associated ClusterGroup object
20+
# cluster_name: name of associated ClusterGroup object for use in output filenames
21+
# cluster_group_id: BSON ID of ClusterGroup object for associations
2122
# de_type: type of differential expression calculation: 'rest' (one-vs-rest) or 'pairwise'
2223
# group1: first group for pairwise comparison
2324
# group2: second group for pairwise comparison
2425
# matrix_file_path: raw counts matrix with source expression data
2526
# matrix_file_type: type of raw counts matrix (dense, sparse)
27+
# matrix_file_id: BSON ID of raw matrix StudyFile for associations
2628
# gene_file (optional): genes/features file for sparse matrix
2729
# barcode_file (optional): barcodes file for sparse matrix
2830
# machine_type (optional): override for default ingest machine type (uses 'n2d-highmem-8')
@@ -34,24 +36,26 @@ class DifferentialExpressionParameters
3436
annotation_file: nil,
3537
cluster_file: nil,
3638
cluster_name: nil,
39+
cluster_group_id: nil,
3740
de_type: 'rest',
3841
group1: nil,
3942
group2: nil,
4043
matrix_file_path: nil,
4144
matrix_file_type: nil,
45+
matrix_file_id: nil,
4246
gene_file: nil,
4347
barcode_file: nil,
4448
machine_type: 'n2d-highmem-8',
4549
file_size: 0
4650
}.freeze
4751

4852
# values that are available as methods but not as attributes (and not passed to command line)
49-
NON_ATTRIBUTE_PARAMS = %i[machine_type file_size].freeze
53+
NON_ATTRIBUTE_PARAMS = %i[machine_type file_size cluster_group_id matrix_file_id].freeze
5054

5155
attr_accessor(*PARAM_DEFAULTS.keys)
5256

53-
validates :annotation_name, :annotation_scope, :annotation_file, :cluster_file,
54-
:cluster_name, :matrix_file_path, :matrix_file_type, presence: true
57+
validates :annotation_name, :annotation_scope, :annotation_file, :cluster_file, :cluster_name, :cluster_group_id,
58+
:matrix_file_path, :matrix_file_type, :matrix_file_id, presence: true
5559
validates :annotation_file, :cluster_file, :matrix_file_path,
5660
format: { with: Parameterizable::GS_URL_REGEXP, message: 'is not a valid GS url' }
5761
validates :annotation_scope, inclusion: %w[cluster study]
@@ -75,4 +79,12 @@ def initialize(attributes = nil)
7579
self.machine_type = assign_machine_type
7680
end
7781
end
82+
83+
def cluster_group
84+
ClusterGroup.find(cluster_group_id)
85+
end
86+
87+
def matrix_file
88+
StudyFile.find(matrix_file_id)
89+
end
7890
end

app/models/differential_expression_result.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def set_automated_comparisons(group1: nil, group2: nil)
203203
private
204204

205205
def set_cluster_name
206-
self.cluster_name = cluster_group.name
206+
self.cluster_name = cluster_group.name unless cluster_name
207207
end
208208

209209
def comparisons_available?

app/models/ingest_job.rb

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -738,14 +738,13 @@ def launch_differential_expression_jobs
738738
# set corresponding differential expression flags on associated annotation
739739
def create_differential_expression_results
740740
annotation_identifier = "#{params_object.annotation_name}--group--#{params_object.annotation_scope}"
741-
Rails.logger.info "Creating differential expression result object for annotation: #{annotation_identifier}"
742-
cluster = ClusterGroup.find_by(study_id: study.id, study_file_id: study_file.id, name: params_object.cluster_name)
743-
matrix_url = params_object.matrix_file_path
744-
bucket_path = matrix_url.split("gs://#{study.bucket_id}/").last
745-
matrix_file = StudyFile.where(
746-
study:, :upload_file_name.in => [bucket_path, bucket_path.split('/').last]
747-
).detect(&:is_raw_counts_file?)
748-
de_result = DifferentialExpressionResult.find_or_initialize_by(
741+
cluster = params_object.cluster_group
742+
matrix_file = params_object.matrix_file
743+
Rails.logger.info "Creating differential expression result object for #{annotation_identifier} " \
744+
"(cluster: #{cluster.name} in #{study.accession})"
745+
de_result = DifferentialExpressionService.find_existing_result(
746+
study, cluster, params_object.annotation_name, params_object.annotation_scope
747+
) || DifferentialExpressionResult.new(
749748
study: study, cluster_group: cluster, cluster_name: cluster.name,
750749
annotation_name: params_object.annotation_name, annotation_scope: params_object.annotation_scope,
751750
matrix_file_id: matrix_file.id
@@ -1000,7 +999,7 @@ def get_job_analytics
1000999
)
10011000
end
10021001
when :differential_expression
1003-
cluster = ClusterGroup.find_by(study_id: study.id, study_file_id: study_file.id, name: params_object.cluster_name)
1002+
cluster = params_object.cluster_group
10041003
annotation_params = {
10051004
cluster: cluster,
10061005
annot_name: params_object.annotation_name,

test/models/batch_api_client_test.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,17 @@ class BatchApiClientTest < ActiveSupport::TestCase
117117
test 'should create and submit DE Batch API job' do
118118
action = :differential_expression
119119
bucket_dir = "_scp_internal/anndata_ingest/#{@ann_data_study.accession}_#{@ann_data_file.id}"
120+
cluster = @ann_data_study.cluster_groups.first
120121
anndata_options = {
121122
annotation_name: 'disease',
122123
annotation_scope: 'study',
123124
annotation_file: "gs://#{@ann_data_study.bucket_id}/#{bucket_dir}/h5ad_frag.metadata.tsv.gz",
124125
cluster_file: "gs://#{@ann_data_study.bucket_id}/#{bucket_dir}/h5ad_frag.cluster.X_umap.tsv.gz",
125126
cluster_name: 'umap',
127+
cluster_group_id: cluster.id,
126128
matrix_file_path: @ann_data_file.gs_url,
127129
matrix_file_type: 'h5ad',
130+
matrix_file_id: @ann_data_file.id,
128131
file_size: @ann_data_file.upload_file_size
129132
}
130133
params_object = DifferentialExpressionParameters.new(**anndata_options)

test/models/differential_expression_parameters_test.rb

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,18 @@
33
class DifferentialExpressionParametersTest < ActiveSupport::TestCase
44

55
before(:all) do
6+
cluster_group_id = BSON::ObjectId.new
7+
matrix_file_id = BSON::ObjectId.new
68
@dense_options = {
79
annotation_name: 'Category',
810
annotation_scope: 'cluster',
911
annotation_file: 'gs://test_bucket/metadata.tsv',
1012
cluster_file: 'gs://test_bucket/cluster.tsv',
1113
cluster_name: 'UMAP',
14+
cluster_group_id:,
1215
matrix_file_path: 'gs://test_bucket/dense.tsv',
13-
matrix_file_type: 'dense'
16+
matrix_file_type: 'dense',
17+
matrix_file_id:
1418
}
1519

1620
@sparse_options = {
@@ -19,8 +23,10 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
1923
annotation_file: 'gs://test_bucket/metadata.tsv',
2024
cluster_file: 'gs://test_bucket/cluster.tsv',
2125
cluster_name: 'UMAP',
26+
cluster_group_id:,
2227
matrix_file_path: 'gs://test_bucket/sparse.tsv',
2328
matrix_file_type: 'mtx',
29+
matrix_file_id:,
2430
gene_file: 'gs://test_bucket/genes.tsv',
2531
barcode_file: 'gs://test_bucket/barcodes.tsv'
2632
}
@@ -31,8 +37,10 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
3137
annotation_file: 'gs://test_bucket/metadata.tsv',
3238
cluster_file: 'gs://test_bucket/cluster.tsv',
3339
cluster_name: 'UMAP',
40+
cluster_group_id:,
3441
matrix_file_path: 'gs://test_bucket/matrix.h5ad',
3542
matrix_file_type: 'h5ad',
43+
matrix_file_id:,
3644
file_size: 10.gigabytes
3745
}
3846

@@ -45,8 +53,10 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
4553
annotation_file: 'gs://test_bucket/metadata.tsv',
4654
cluster_file: 'gs://test_bucket/cluster.tsv',
4755
cluster_name: 'UMAP',
56+
cluster_group_id:,
4857
matrix_file_path: 'gs://test_bucket/dense.tsv',
49-
matrix_file_type: 'dense'
58+
matrix_file_type: 'dense',
59+
matrix_file_id:
5060
}
5161
end
5262

@@ -144,4 +154,47 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
144154
dense_params = DifferentialExpressionParameters.new(@dense_options)
145155
assert_not_includes dense_params.attributes, :machine_type
146156
end
157+
158+
test 'should find cluster group and matrix file' do
159+
user = FactoryBot.create(:user, test_array: @@users_to_clean)
160+
study = FactoryBot.create(:detached_study,
161+
name_prefix: 'DifferentialExpressionParameters Test',
162+
user:,
163+
test_array: @@studies_to_clean)
164+
raw_matrix = FactoryBot.create(:expression_file,
165+
name: 'raw.txt',
166+
study:,
167+
expression_file_info: {
168+
is_raw_counts: true,
169+
units: 'raw counts',
170+
library_preparation_protocol: 'Drop-seq',
171+
biosample_input_type: 'Whole cell',
172+
modality: 'Proteomic'
173+
})
174+
175+
cells = %w[A B C D E]
176+
cluster_file = FactoryBot.create(:cluster_file,
177+
name: 'cluster_diffexp.tsv',
178+
study:,
179+
cell_input: {
180+
x: [1, 4, 6, 8, 9],
181+
y: [7, 5, 3, 2, 1],
182+
cells:
183+
})
184+
cluster_group = ClusterGroup.find_by(study:, study_file: cluster_file)
185+
params = {
186+
annotation_name: 'Category',
187+
annotation_scope: 'cluster',
188+
annotation_file: 'gs://test_bucket/metadata.tsv',
189+
cluster_file: 'gs://test_bucket/cluster_diffexp.tsv',
190+
cluster_name: 'cluster_diffexp.tsv',
191+
cluster_group_id: cluster_group.id,
192+
matrix_file_path: 'gs://test_bucket/dense.tsv',
193+
matrix_file_type: 'dense',
194+
matrix_file_id: raw_matrix.id
195+
}
196+
de_params = DifferentialExpressionParameters.new(params)
197+
assert_equal cluster_group, de_params.cluster_group
198+
assert_equal raw_matrix, de_params.matrix_file
199+
end
147200
end

test/models/ingest_job_test.rb

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -879,7 +879,8 @@ class IngestJobTest < ActiveSupport::TestCase
879879
cluster_file = "gs://#{bucket}/anndata/h5ad_frag.cluster.X_umap.tsv"
880880
params_object = DifferentialExpressionParameters.new(
881881
matrix_file_path: "gs://#{bucket}/matrix.h5ad", matrix_file_type: 'h5ad', file_size: study_file.upload_file_size,
882-
annotation_file:, cluster_file:, cluster_name: 'umap', annotation_name: 'disease', annotation_scope: 'study'
882+
annotation_file:, cluster_file:, cluster_name: 'umap', annotation_name: 'disease', annotation_scope: 'study',
883+
cluster_group_id: BSON::ObjectId.new, matrix_file_id: study_file.id
883884
)
884885
pipeline_name = SecureRandom.uuid
885886
study_file.update(parse_status: 'parsing')
@@ -1079,7 +1080,7 @@ class IngestJobTest < ActiveSupport::TestCase
10791080
# one vs rest test
10801081
one_vs_rest = DifferentialExpressionParameters.new(
10811082
annotation_name: 'cell_type__ontology_label', annotation_scope: 'study', cluster_name: 'cluster_diffexp.txt',
1082-
matrix_file_path: "gs://#{study.bucket_id}/raw.txt"
1083+
matrix_file_path: "gs://#{study.bucket_id}/raw.txt", cluster_group_id: cluster_group.id, matrix_file_id: matrix.id
10831084
)
10841085
job = IngestJob.new(study:, study_file: cluster_file, action: :differential_expression, params_object: one_vs_rest)
10851086
job.create_differential_expression_results
@@ -1094,7 +1095,8 @@ class IngestJobTest < ActiveSupport::TestCase
10941095
# pairwise test
10951096
pairwise = DifferentialExpressionParameters.new(
10961097
annotation_name: 'cell_type__ontology_label', annotation_scope: 'study', cluster_name: 'cluster_diffexp.txt',
1097-
matrix_file_path: "gs://#{study.bucket_id}/raw.txt", de_type: 'pairwise', group1: 'B cell', group2: 'T cell'
1098+
matrix_file_path: "gs://#{study.bucket_id}/raw.txt", de_type: 'pairwise', group1: 'B cell', group2: 'T cell',
1099+
cluster_group_id: cluster_group.id, matrix_file_id: matrix.id
10981100
)
10991101
job = IngestJob.new(study:, study_file: cluster_file, action: :differential_expression, params_object: pairwise)
11001102
job.create_differential_expression_results

test/services/differential_expression_service_test.rb

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,4 +445,19 @@ class DifferentialExpressionServiceTest < ActiveSupport::TestCase
445445
user.reload
446446
assert_equal 0, user.weekly_de_quota
447447
end
448+
449+
test 'should set cluster name' do
450+
annotation_name = 'cell_type__ontology_label'
451+
annotation_scope = 'study'
452+
assert_equal @cluster_group.name, DifferentialExpressionService.set_cluster_name(
453+
@basic_study, @cluster_group, annotation_name, annotation_scope
454+
)
455+
DifferentialExpressionResult.create(
456+
study: @basic_study, cluster_group: @cluster_group, cluster_name: 'umap', annotation_name:, annotation_scope:,
457+
matrix_file_id: @raw_matrix.id
458+
)
459+
assert_equal 'umap', DifferentialExpressionService.set_cluster_name(
460+
@basic_study, @cluster_group, annotation_name, annotation_scope
461+
)
462+
end
448463
end

0 commit comments

Comments
 (0)