
Commit c4a9f83

Optionally download files to EBS rather than use S3FS (#116)
* First attempt at downloading files
* fix string reading of metadata
* remove files when you're done with them
* my code runs too fast
* use fields
* make channels into a list
* make directory for files to land in
* port to python2
1 parent 82dc8c6 commit c4a9f83

File tree

6 files changed: +131, -5 lines


config.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 MACHINE_TYPE = ['m4.xlarge']
 MACHINE_PRICE = 0.10
 EBS_VOL_SIZE = 30 # In GB. Minimum allowed is 22.
+DOWNLOAD_FILES = 'False'

 # DOCKER INSTANCE RUNNING ENVIRONMENT:
 DOCKER_CORES = 1 # Number of CellProfiler processes to run inside a docker container
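
Note that DOWNLOAD_FILES is deliberately the string 'False' rather than a boolean: it reaches the worker through an ECS task-definition environment variable (see the run.py change below), and environment variables are always strings. A minimal sketch of why the worker then needs a nested check rather than a bare truthiness test (variable names mirror the worker code):

    import os

    # Environment variables are always strings, and any non-empty string
    # is truthy in Python -- including 'False'.
    DOWNLOAD_FILES = os.environ.get('DOWNLOAD_FILES', False)

    if DOWNLOAD_FILES:                        # passes for 'True' and for 'False'
        if DOWNLOAD_FILES.lower() == 'true':  # the check that actually gates downloads
            print('Will download files to local EBS instead of using S3FS')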

python2worker/Dockerfile

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@ RUN \
 RUN \
     pip install watchtower==0.8.0

+# Install pandas for optional file downloading
+
+RUN pip install pandas
+
 # SETUP NEW ENTRYPOINT

 RUN mkdir -p /home/ubuntu/

python2worker/cp-worker.py

Lines changed: 59 additions & 3 deletions
@@ -35,6 +35,13 @@
     NECESSARY_STRING = False
 else:
     NECESSARY_STRING = os.environ['NECESSARY_STRING']
+if 'DOWNLOAD_FILES' not in os.environ:
+    DOWNLOAD_FILES = False
+else:
+    DOWNLOAD_FILES = os.environ['DOWNLOAD_FILES']
+
+localIn = '/home/ubuntu/local_input'
+

 #################################
 # CLASS TO HANDLE THE SQS QUEUE
@@ -159,8 +166,54 @@ def runCellProfiler(message):
         logger.removeHandler(watchtowerlogger)
         return 'SUCCESS'
     except KeyError: #Returned if that folder does not exist
-        pass
-
+        pass
+
+    csv_name = os.path.join(DATA_ROOT,message['data_file'])
+
+    # Optional- download files
+    if DOWNLOAD_FILES:
+        if DOWNLOAD_FILES.lower() == 'true':
+            printandlog('Figuring which files to download', logger)
+            import pandas
+            s3 = boto3.resource('s3')
+            if not os.path.exists(localIn):
+                os.mkdir(localIn)
+            csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file']))
+            csv_in=csv_in.astype('str')
+            #Figure out what metadata fields we need in this experiment, as a dict
+            if type(message['Metadata'])==dict:
+                filter_dict = message['Metadata']
+            else:
+                filter_dict = {}
+                for eachMetadata in message['Metadata'].split(','):
+                    filterkey, filterval = eachMetadata.split('=')
+                    filter_dict[filterkey] = filterval
+            #Filter our CSV to just the rows CellProfiler will process, so that we can download only what we need
+            for eachfilter in filter_dict.keys():
+                csv_in = csv_in[csv_in[eachfilter] == filter_dict[eachfilter]]
+            #Figure out the actual file names and get them
+            channel_list = [x.split('FileName_')[1] for x in csv_in.columns if 'FileName' in x]
+            count = 0
+            printandlog('Downloading files', logger)
+            for channel in channel_list:
+                for field in range(csv_in.shape[0]):
+                    full_old_file_name = os.path.join(list(csv_in['PathName_'+channel])[field],list(csv_in['FileName_'+channel])[field])
+                    prefix_on_bucket = full_old_file_name.split(DATA_ROOT)[1][1:]
+                    new_file_name = os.path.join(localIn,prefix_on_bucket)
+                    if not os.path.exists(os.path.split(new_file_name)[0]):
+                        os.makedirs(os.path.split(new_file_name)[0])
+                        printandlog('made directory '+os.path.split(new_file_name)[0],logger)
+                    s3.meta.client.download_file(AWS_BUCKET,prefix_on_bucket,new_file_name)
+                    count +=1
+            printandlog('Downloaded '+str(count)+' files',logger)
+            local_csv_name = os.path.join(localIn,os.path.split(csv_name)[1])
+            if not os.path.exists(local_csv_name):
+                csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file']))
+                csv_in.replace(DATA_ROOT,localIn,regex=True, inplace=True)
+                csv_in.to_csv(local_csv_name,index=False)
+                print('Wrote updated CSV')
+            csv_name = local_csv_name
+
     # Build and run CellProfiler command
     cpDone = localOut + '/cp.is.done'
     cp2 = False
@@ -173,7 +226,7 @@ def runCellProfiler(message):
         cmdstem = 'cellprofiler -c -r '
     if message['pipeline'][-3:]!='.h5':
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone
-        cmd += ' --data-file=%(DATA)s/%(FL)s '
+        cmd += ' --data-file='+csv_name+' '
        cmd += '-g %(Metadata)s'
     else:
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone + ' -g %(Metadata)s'
@@ -189,6 +242,9 @@ def runCellProfiler(message):
     # Get the outputs and move them to S3
     if os.path.isfile(cpDone):
         time.sleep(30)
+        if os.path.exists(localIn):
+            import shutil
+            shutil.rmtree(localIn, ignore_errors=True)
         mvtries=0
         while mvtries <3:
             try:
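
To make the filtering step above concrete, here is a small, self-contained sketch (the DataFrame contents and metadata string are hypothetical) of how a 'key=value,key=value' Metadata string from the SQS message becomes a filter dict and prunes the load_data CSV down to just the rows one job will process:

    import pandas

    # Hypothetical message payload and CSV contents, for illustration only
    message_metadata = 'Metadata_Plate=Plate1,Metadata_Well=A01'
    csv_in = pandas.DataFrame({
        'Metadata_Plate': ['Plate1', 'Plate1', 'Plate2'],
        'Metadata_Well':  ['A01', 'B02', 'A01'],
        'FileName_DNA':   ['a.tif', 'b.tif', 'c.tif'],
    }).astype('str')

    # Parse 'key=value,key=value' into a dict, as in the diff above
    filter_dict = {}
    for eachMetadata in message_metadata.split(','):
        filterkey, filterval = eachMetadata.split('=')
        filter_dict[filterkey] = filterval

    # Keep only the rows matching every metadata filter
    for eachfilter in filter_dict.keys():
        csv_in = csv_in[csv_in[eachfilter] == filter_dict[eachfilter]]

    print(csv_in['FileName_DNA'].tolist())  # ['a.tif'] -- the one file to download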

run.py

Lines changed: 6 additions & 1 deletion
@@ -126,7 +126,11 @@ def generate_task_definition(AWS_PROFILE):
             {
                 "name": "NECESSARY_STRING",
                 "value": NECESSARY_STRING
-            }
+            },
+            {
+                "name": "DOWNLOAD_FILES",
+                "value": DOWNLOAD_FILES
+            }
         ]
     return task_definition

@@ -440,6 +444,7 @@ def startCluster():

     # Step 6: Monitor the creation of the instances until all are present
     status = ec2client.describe_spot_fleet_instances(SpotFleetRequestId=requestInfo['SpotFleetRequestId'])
+    time.sleep(15) # This is now too fast, so sometimes the spot fleet request history throws an error!
     while len(status['ActiveInstances']) < CLUSTER_MACHINES:
         # First check to make sure there's not a problem
         errorcheck = ec2client.describe_spot_fleet_request_history(SpotFleetRequestId=requestInfo['SpotFleetRequestId'], EventType='error', StartTime=thistime)
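
The fixed time.sleep(15) papers over a race: immediately after a spot fleet request is created, describe_spot_fleet_request_history can fail because the request is not yet queryable. An illustrative alternative sketch (not what this commit does) would retry the call instead of sleeping a fixed amount:

    import time
    from botocore.exceptions import ClientError

    def get_fleet_errors(ec2client, request_id, start_time, retries=5):
        # Retry because a freshly created spot fleet request may not be
        # visible to the history API for several seconds.
        for _ in range(retries):
            try:
                return ec2client.describe_spot_fleet_request_history(
                    SpotFleetRequestId=request_id,
                    EventType='error',
                    StartTime=start_time)
            except ClientError:
                time.sleep(15)
        raise RuntimeError('Spot fleet request history never became available')

    # Usage (assuming an existing boto3 EC2 client):
    # errorcheck = get_fleet_errors(ec2client, requestInfo['SpotFleetRequestId'], thistime)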

worker/Dockerfile

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,10 @@ RUN python3.8 -m pip install boto3

 RUN python3.8 -m pip install watchtower

+# Install pandas for optional file downloading
+
+RUN python3.8 -m pip install pandas
+
 # SETUP NEW ENTRYPOINT

 RUN mkdir -p /home/ubuntu/

worker/cp-worker.py

Lines changed: 57 additions & 1 deletion
@@ -35,6 +35,13 @@
     NECESSARY_STRING = False
 else:
     NECESSARY_STRING = os.environ['NECESSARY_STRING']
+if 'DOWNLOAD_FILES' not in os.environ:
+    DOWNLOAD_FILES = False
+else:
+    DOWNLOAD_FILES = os.environ['DOWNLOAD_FILES']
+
+localIn = '/home/ubuntu/local_input'
+

 #################################
 # CLASS TO HANDLE THE SQS QUEUE
@@ -161,12 +168,58 @@ def runCellProfiler(message):
     except KeyError: #Returned if that folder does not exist
         pass

+    csv_name = os.path.join(DATA_ROOT,message['data_file'])
+
+    # Optional- download files
+    if DOWNLOAD_FILES:
+        if DOWNLOAD_FILES.lower() == 'true':
+            printandlog('Figuring which files to download', logger)
+            import pandas
+            s3 = boto3.resource('s3')
+            if not os.path.exists(localIn):
+                os.mkdir(localIn)
+            csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file']))
+            csv_in=csv_in.astype('str')
+            #Figure out what metadata fields we need in this experiment, as a dict
+            if type(message['Metadata'])==dict:
+                filter_dict = message['Metadata']
+            else:
+                filter_dict = {}
+                for eachMetadata in message['Metadata'].split(','):
+                    filterkey, filterval = eachMetadata.split('=')
+                    filter_dict[filterkey] = filterval
+            #Filter our CSV to just the rows CellProfiler will process, so that we can download only what we need
+            for eachfilter in filter_dict.keys():
+                csv_in = csv_in[csv_in[eachfilter] == filter_dict[eachfilter]]
+            #Figure out the actual file names and get them
+            channel_list = [x.split('FileName_')[1] for x in csv_in.columns if 'FileName' in x]
+            count = 0
+            printandlog('Downloading files', logger)
+            for channel in channel_list:
+                for field in range(csv_in.shape[0]):
+                    full_old_file_name = os.path.join(list(csv_in['PathName_'+channel])[field],list(csv_in['FileName_'+channel])[field])
+                    prefix_on_bucket = full_old_file_name.split(DATA_ROOT)[1][1:]
+                    new_file_name = os.path.join(localIn,prefix_on_bucket)
+                    if not os.path.exists(os.path.split(new_file_name)[0]):
+                        os.makedirs(os.path.split(new_file_name)[0])
+                        printandlog('made directory '+os.path.split(new_file_name)[0],logger)
+                    s3.meta.client.download_file(AWS_BUCKET,prefix_on_bucket,new_file_name)
+                    count +=1
+            printandlog('Downloaded '+str(count)+' files',logger)
+            local_csv_name = os.path.join(localIn,os.path.split(csv_name)[1])
+            if not os.path.exists(local_csv_name):
+                csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file']))
+                csv_in.replace(DATA_ROOT,localIn,regex=True, inplace=True)
+                csv_in.to_csv(local_csv_name,index=False)
+                print('Wrote updated CSV')
+            csv_name = local_csv_name
+
     # Build and run CellProfiler command
     cpDone = localOut + '/cp.is.done'
     cmdstem = 'cellprofiler -c -r '
     if message['pipeline'][-3:]!='.h5':
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone
-        cmd += ' --data-file=%(DATA)s/%(FL)s '
+        cmd += ' --data-file='+csv_name+' '
         cmd += '-g %(Metadata)s'
     else:
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone + ' -g %(Metadata)s'
@@ -182,6 +235,9 @@ def runCellProfiler(message):
     # Get the outputs and move them to S3
     if os.path.isfile(cpDone):
         time.sleep(30)
+        if os.path.exists(localIn):
+            import shutil
+            shutil.rmtree(localIn, ignore_errors=True)
         mvtries=0
         while mvtries <3:
             try:
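
The path juggling in both workers relies on one invariant: under S3FS the bucket is mounted at DATA_ROOT, so stripping DATA_ROOT (plus the leading slash) from a mounted path yields the S3 object key, which can then be re-rooted under localIn. A small sketch with hypothetical paths:

    import os

    DATA_ROOT = '/home/ubuntu/bucket'     # where S3FS mounts the bucket
    localIn = '/home/ubuntu/local_input'  # EBS-backed download destination

    full_old_file_name = '/home/ubuntu/bucket/project/images/plate1/ch1_site1.tif'

    # Strip the mount prefix and the leading '/' to recover the S3 object key
    prefix_on_bucket = full_old_file_name.split(DATA_ROOT)[1][1:]
    new_file_name = os.path.join(localIn, prefix_on_bucket)

    print(prefix_on_bucket)  # project/images/plate1/ch1_site1.tif
    print(new_file_name)     # /home/ubuntu/local_input/project/images/plate1/ch1_site1.tif

The same prefix swap is what csv_in.replace(DATA_ROOT, localIn, regex=True) performs on every path column of the CSV, so the rewritten load_data file points CellProfiler at the local copies.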
