Skip to content

Commit fc5acda

Browse files
authored
Merge pull request #114 from billsacks/fix_large_output_hang
Fix hangs when a subcommand has a lot of output. Rework execute_subprocess timeout handling to prevent hangs. The previous implementation, relying on the lower-level subprocess.Popen, was hanging when trying to get the status of a git repository with many changed files (see #113). It seems that the problem was that subprocess.Popen buffers output, and we would need to write some extra code to periodically clear the output buffer. This seemed tricky to get right, so I have backed up to using the higher-level subprocess.check_output, as we did prior to #81. I am using a new method to detect hangs and print a useful message (which was the point of PR #81), based on threading.Timer. I can't figure out a way to kill the subprocess when it appears to be hanging, but at least we can notify the user of this possible problem. This commit brings this part of the code back to how it was prior to d6423c6, except now we have a hang-detection timer. As a side benefit, it appears that this change speeds up the unit and system tests, based on a few trial runs before and after this change. (This is probably due to the removal of the 0.02 sec delay associated with the previous subprocess.Popen implementation.) User interface changes?: No. Fixes #113 (Timeout when getting status from a git repository with many changes). Testing: test removed: none; unit tests: pass; system tests: pass; manual testing: (1) Tested the problem reported in #113 by putting a ton of changes in components/clm (changing permissions on all files, and adding about 7000 files in a testmods directory). The original hung; this version succeeded quickly. (2) Tested the problem reported in #79 and fixed in #81 (because this PR undoes the changes from #81): I tested with authentication required (on hobart, by pointing to the NGEET fates repo): I changed the timeout to 10 seconds, ran 'manage_externals/checkout_externals clm', and reviewed the printed message.
It looks like this, after about 10 seconds: Checking out externals: fates, Username for 'https://github.com': Command 'git clone --quiet https://github.com/NGEET/fates fates' from directory /scratch/cluster/sacks/cesm/components/clm/src has taken 10 seconds. It may be hanging. The command will continue to run, but you may want to abort manage_externals with ^C and investigate. A possible cause of hangs is when svn or git require authentication to access a private repository. On some systems, svn and git requests for authentication information will not be displayed to the user. In this case, the program will appear to hang. Ensure you can run svn and git manually and access all repositories without entering your authentication information.
2 parents b0b23a6 + aa2eb71 commit fc5acda

File tree

5 files changed

+59
-94
lines changed

5 files changed

+59
-94
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ matrix:
1717
# NOTE(bja, 2017-11) update is slow, 2.7.12 installed by default, good enough!
1818
# - brew update
1919
# - brew outdated python2 || brew upgrade python2
20+
- pip install virtualenv
2021
- virtualenv env -p python2
2122
- source env/bin/activate
2223
install:
@@ -29,4 +30,3 @@ script:
2930
after_success:
3031
- cd test; make coverage
3132
- cd test; coveralls
32-

manic/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Public API for the manage_externals library
22
"""
33

4-
import manic.checkout as checkout
4+
from manic import checkout
55
from manic.utils import printlog
66

77
__all__ = [

manic/repository_git.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ def checkout(self, base_dir_path, repo_dir_name, verbosity):
5656
"""
5757
repo_dir_path = os.path.join(base_dir_path, repo_dir_name)
5858
repo_dir_exists = os.path.exists(repo_dir_path)
59-
if (repo_dir_exists and not os.listdir(repo_dir_path)) or not repo_dir_exists:
59+
if (repo_dir_exists and not os.listdir(
60+
repo_dir_path)) or not repo_dir_exists:
6061
self._clone_repo(base_dir_path, repo_dir_name, verbosity)
6162
self._checkout_ref(repo_dir_path, verbosity)
6263

manic/utils.py

Lines changed: 54 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import os
1313
import subprocess
1414
import sys
15-
import time
15+
from threading import Timer
1616

1717
from .global_constants import LOCAL_PATH_INDICATOR
1818

@@ -65,13 +65,15 @@ def last_n_lines(the_string, n_lines, truncation_message=None):
6565

6666
lines = the_string.splitlines(True)
6767
if len(lines) <= n_lines:
68-
return the_string
68+
return_val = the_string
6969
else:
7070
lines_subset = lines[-n_lines:]
7171
str_truncated = ''.join(lines_subset)
7272
if truncation_message:
7373
str_truncated = truncation_message + '\n' + str_truncated
74-
return str_truncated
74+
return_val = str_truncated
75+
76+
return return_val
7577

7678

7779
def indent_string(the_string, indent_level):
@@ -119,9 +121,9 @@ def str_to_bool(bool_str):
119121
"""
120122
value = None
121123
str_lower = bool_str.lower()
122-
if (str_lower == 'true') or (str_lower == 't'):
124+
if str_lower in ('true', 't'):
123125
value = True
124-
elif (str_lower == 'false') or (str_lower == 'f'):
126+
elif str_lower in ('false', 'f'):
125127
value = False
126128
if value is None:
127129
msg = ('ERROR: invalid boolean string value "{0}". '
@@ -199,80 +201,30 @@ def expand_local_url(url, field):
199201
# subprocess
200202
#
201203
# ---------------------------------------------------------------------
202-
_TIMEOUT_MSG = """ Timout errors typically occur when svn or git requires
203-
authentication to access a private repository. On some systems, svn
204-
and git requests for authentication information will not be displayed
205-
to the user. In this case, the program will appear to hang and
206-
generate a timeout error. Ensure you can run svn and git manually and
207-
access all repositories without entering your authentication
208-
information."""
209-
210-
_TIMEOUT_SEC = 300
211-
_POLL_DELTA_SEC = 0.02
212-
213-
214-
def _poll_subprocess(commands, status_to_caller, output_to_caller,
215-
timeout_sec=_TIMEOUT_SEC):
216-
"""Create a subprocess and poll the process until complete.
217204

218-
Impose a timeout limit and checkout process output for known
219-
conditions that require user interaction.
205+
# Give the user a helpful message if we detect that a command seems to
206+
# be hanging.
207+
_HANGING_SEC = 300
220208

221-
NOTE: the timeout_delta has significant impact on run time. If it
222-
is too long, and the many quick local subprocess calls will
223-
drastically increase the run time, especially in tests.
224209

225-
NOTE: This function is broken out into for ease of
226-
understanding. It does no error checking. It should only be called
227-
from execute_subprocess, never directly.
210+
def _hanging_msg(working_directory, command):
211+
print("""
228212
229-
"""
230-
logging.info(' '.join(commands))
231-
output = []
232-
start = time.time()
233-
234-
proc = subprocess.Popen(commands,
235-
shell=False,
236-
stdout=subprocess.PIPE,
237-
stderr=subprocess.STDOUT,
238-
universal_newlines=True)
239-
while proc.poll() is None:
240-
time.sleep(_POLL_DELTA_SEC)
241-
if time.time() - start > timeout_sec:
242-
proc.kill()
243-
time.sleep(_POLL_DELTA_SEC * 5)
244-
msg = ("subprocess call to '{0}' has exceeded timeout limit of "
245-
"{1} seconds.\n{2}".format(commands[0], timeout_sec,
246-
_TIMEOUT_MSG))
247-
fatal_error(msg)
248-
finish = time.time()
213+
Command '{command}'
214+
from directory {working_directory}
215+
has taken {hanging_sec} seconds. It may be hanging.
249216
250-
run_time_msg = "run time : {0:.2f} seconds".format(finish - start)
251-
logging.info(run_time_msg)
252-
output = proc.stdout.read()
253-
log_process_output(output)
254-
status = proc.returncode
217+
The command will continue to run, but you may want to abort
218+
manage_externals with ^C and investigate. A possible cause of hangs is
219+
when svn or git require authentication to access a private
220+
repository. On some systems, svn and git requests for authentication
221+
information will not be displayed to the user. In this case, the program
222+
will appear to hang. Ensure you can run svn and git manually and access
223+
all repositories without entering your authentication information.
255224
256-
# NOTE(bja, 2018-03) need to cleanup open files. In python3 use
257-
# "with subprocess.Popen(...) as proc:", but that is not available
258-
# with python2 unless we create a context manager.
259-
proc.stdout.close()
260-
261-
if status != 0:
262-
raise subprocess.CalledProcessError(returncode=status,
263-
cmd=commands,
264-
output=output)
265-
266-
if status_to_caller and output_to_caller:
267-
ret_value = (status, output)
268-
elif status_to_caller:
269-
ret_value = status
270-
elif output_to_caller:
271-
ret_value = output
272-
else:
273-
ret_value = None
274-
275-
return ret_value
225+
""".format(command=command,
226+
working_directory=working_directory,
227+
hanging_sec=_HANGING_SEC))
276228

277229

278230
def execute_subprocess(commands, status_to_caller=False,
@@ -288,19 +240,24 @@ def execute_subprocess(commands, status_to_caller=False,
288240
return code, otherwise execute_subprocess treats non-zero return
289241
status as an error and raises an exception.
290242
291-
NOTE(bja, 2018-03) we are polling the running process to avoid
292-
having it hang indefinitely if there is input that we don't
293-
detect. Some large checkouts are multiple minutes long. For now we
294-
are setting the timeout interval to five minutes.
295-
296243
"""
297-
msg = 'In directory: {0}\nexecute_subprocess running command:'.format(
298-
os.getcwd())
244+
cwd = os.getcwd()
245+
msg = 'In directory: {0}\nexecute_subprocess running command:'.format(cwd)
299246
logging.info(msg)
300-
logging.info(commands)
247+
commands_str = ' '.join(commands)
248+
logging.info(commands_str)
249+
return_to_caller = status_to_caller or output_to_caller
250+
status = -1
251+
output = ''
252+
hanging_timer = Timer(_HANGING_SEC, _hanging_msg,
253+
kwargs={"working_directory": cwd,
254+
"command": commands_str})
255+
hanging_timer.start()
301256
try:
302-
ret_value = _poll_subprocess(
303-
commands, status_to_caller, output_to_caller)
257+
output = subprocess.check_output(commands, stderr=subprocess.STDOUT,
258+
universal_newlines=True)
259+
log_process_output(output)
260+
status = 0
304261
except OSError as error:
305262
msg = failed_command_msg(
306263
'Command execution failed. Does the executable exist?',
@@ -319,20 +276,27 @@ def execute_subprocess(commands, status_to_caller=False,
319276
# simple status check. If returning, it is the caller's
320277
# responsibility to determine if an error occurred and handle it
321278
# appropriately.
322-
if status_to_caller and output_to_caller:
323-
ret_value = (error.returncode, error.output)
324-
elif status_to_caller:
325-
ret_value = error.returncode
326-
elif output_to_caller:
327-
ret_value = error.output
328-
else:
279+
if not return_to_caller:
329280
msg_context = ('Process did not run successfully; '
330281
'returned status {0}'.format(error.returncode))
331282
msg = failed_command_msg(msg_context, commands,
332283
output=error.output)
333284
logging.error(error)
285+
logging.error(msg)
334286
log_process_output(error.output)
335287
fatal_error(msg)
288+
status = error.returncode
289+
finally:
290+
hanging_timer.cancel()
291+
292+
if status_to_caller and output_to_caller:
293+
ret_value = (status, output)
294+
elif status_to_caller:
295+
ret_value = status
296+
elif output_to_caller:
297+
ret_value = output
298+
else:
299+
ret_value = None
336300

337301
return ret_value
338302

test/.pylint.rc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ confidence=
5050
# --enable=similarities". If you want to run only the classes checker, but have
5151
# no Warning level messages displayed, use"--disable=all --enable=classes
5252
# --disable=W"
53-
disable=bad-continuation
53+
disable=bad-continuation,useless-object-inheritance
5454

5555

5656
# Enable the message, report, category or checker with the given id(s). You can

0 commit comments

Comments
 (0)