-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.py
135 lines (114 loc) · 4.57 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from nbconvert.preprocessors import Preprocessor
import re
import ast
import markdown
class MetaDataExtractionFailure(Exception):
    """Raised when a cell line cannot be parsed as a 'key: value' pair."""
class Metadata(Preprocessor):
    """Preprocessor to extract metadata from first cell of notebook.

    All state lives on the class itself (the methods are static), so the
    extracted values describe the most recently processed notebook.
    """
    # Metadata extracted by the latest call to preprocess().
    data = {}
    # Markdown converter used to render summaries; it must be assigned
    # before preprocess() runs (config_pres() in this module does so).
    md = None
    # When true, the first remaining cell is used as the summary unless
    # the notebook names another cell through 'summarycell'.
    summary_cell = False

    # Regex for 'key: value' syntax (an optional list bullet is allowed)
    key_value_regex = re.compile(
        r'^\s*[*+-]?\s*(?P<key>[a-zA-Z]+)\s*:\s*(?P<value>.*)$')

    @staticmethod
    def extract_cell_metadata(cell):
        """Extract metadata from the given notebook cell source.

        Leading Markdown heading markers (``#``) are rewritten to
        ``title: `` so a heading line becomes the ``title`` entry. Every
        remaining non-blank line must have the form ``key: value``.

        Args:
            cell: Source text of the cell.

        Returns:
            A dict mapping lower-cased keys to stripped values. A key that
            appears more than once keeps its last value.

        Raises:
            MetaDataExtractionFailure: If a non-blank line is not a
                ``key: value`` pair.
        """
        # Convert Markdown title syntax to 'title:'
        cell = re.sub(r'^#+\s*', 'title: ', cell, flags=re.MULTILINE)

        # Extract metadata from key-value pairs in non-empty lines
        lines = [line.strip() for line in cell.split('\n') if line.strip()]
        metadata = {}
        for line in lines:
            match = Metadata.key_value_regex.match(line)
            if not match:
                raise MetaDataExtractionFailure(
                    'Failed to extract metadata with {l!r}'.format(l=line))
            key, value = match.group('key', 'value')
            metadata[key.lower()] = value.strip()
        return metadata

    @staticmethod
    def preprocess(nb, resources):
        """Process the notebook to extract metadata.

        The first cell is parsed as metadata and removed from the notebook.
        If it cannot be parsed, every cell is kept and ``Metadata.data``
        falls back to ``{'status': 'draft'}``. A ``summary`` entry is
        rendered through ``Metadata.md``; a ``summarycell`` entry (or the
        ``summary_cell`` class flag) selects a Markdown cell whose rendered
        content becomes the summary.

        Args:
            nb: Notebook whose ``cells`` are inspected and replaced.
            resources: Passed through unchanged.

        Returns:
            The ``(nb, resources)`` tuple.

        Raises:
            Exception: If no cells remain after removing the metadata cell.
            IndexError: If the notebook has no cells at all.
        """
        try:
            Metadata.data = Metadata.extract_cell_metadata(
                nb.cells[0]['source'])
        except MetaDataExtractionFailure:
            # First cell is not a metadata cell: keep it as content.
            Metadata.data = {'status': 'draft'}
        else:
            nb.cells = nb.cells[1:]
            if not nb.cells:
                raise Exception('No content cells after metadata extraction!')

        if 'summary' in Metadata.data:
            Metadata.data['summary'] = Metadata.md.convert(
                Metadata.data['summary'])

        if Metadata.summary_cell and 'summarycell' not in Metadata.data:
            Metadata.data['summarycell'] = 1

        if 'summarycell' in Metadata.data:
            s = Metadata.data['summarycell']
            try:
                # An empty value means "use the first cell".
                cell_id = int(s) if s else 1
            except ValueError as e:
                print(e)
                print("Using first cell as summary")
                cell_id = 1

            # 'summarycell' is a 1-based index into the remaining cells.
            # Check both bounds so an index past the last cell is ignored
            # instead of raising IndexError.
            if 0 < cell_id <= len(nb.cells):
                cell = nb.cells[cell_id - 1]
                if cell.cell_type == "markdown":
                    Metadata.data['summary'] = Metadata.md.convert(
                        cell.source)
        return nb, resources
class SubCells(Preprocessor):
    """A preprocessor to select a slice of the cells of a notebook.

    The slice is ``cells[start:end]``. The class attributes below give the
    default bounds; a notebook overrides them with a ``subcells`` metadata
    entry holding a Python literal pair such as ``[1, 5]`` or ``(2, None)``.
    """
    # Default slice bounds, used when a notebook has no 'subcells' entry.
    start = 0
    end = None

    @staticmethod
    def preprocess(nb, resources):
        """Keep only the selected slice of ``nb.cells``.

        Args:
            nb: Notebook whose ``cells`` list is replaced by the slice.
            resources: Passed through unchanged.

        Returns:
            The ``(nb, resources)`` tuple.

        Raises:
            Exception: If the slice leaves the notebook without cells.
                Errors from ``ast.literal_eval`` on a malformed
                ``subcells`` value propagate unchanged.
        """
        start, end = SubCells.start, SubCells.end
        if 'subcells' in Metadata.data:
            # Keep per-notebook bounds in locals. Writing them back to the
            # class attributes would make one notebook's slice apply to
            # every later notebook that has no 'subcells' entry.
            start, end = ast.literal_eval(Metadata.data['subcells'])
        nb.cells = nb.cells[start:end]
        if not nb.cells:
            raise Exception('No content cells after SubCells!')
        return nb, resources
class RemoveEmpty(Preprocessor):
    """Remove cells whose source contains no visible characters."""
    # Matches any non-whitespace character. Written as a raw string so
    # that '\S' is not treated as an (invalid) string escape sequence.
    visible = re.compile(r'\S')

    @staticmethod
    def preprocess(nb, resources):
        """Drop empty and whitespace-only cells from the notebook.

        Args:
            nb: Notebook whose ``cells`` list is filtered.
            resources: Passed through unchanged.

        Returns:
            The ``(nb, resources)`` tuple.

        Raises:
            Exception: If no cells remain after filtering.
        """
        nb.cells = [cell for cell in nb.cells
                    if RemoveEmpty.visible.search(cell['source'])]
        if not nb.cells:
            raise Exception('No content cells after RemoveEmpty!')
        return nb, resources
class IgnoreTag(Preprocessor):
    """Discard cells whose source begins with the '#ignore' tag."""

    @staticmethod
    def preprocess(nb, resources):
        """Filter out tagged cells and return ``(nb, resources)``.

        Raises:
            Exception: If every cell was tagged and none remain.
        """
        kept = []
        for cell in nb.cells:
            if cell['source'].startswith('#ignore'):
                continue
            kept.append(cell)
        nb.cells = kept
        if not nb.cells:
            raise Exception('No content cells after IgnoreTag!')
        return nb, resources
# Optional preprocessors, each paired with the setting that enables it.
# The list order is the order in which config_pres() appends them.
pres = [
    ('IPYNB_SUBCELLS', SubCells),
    ('IPYNB_IGNORE', IgnoreTag),
    ('IPYNB_REMOVE_EMPTY', RemoveEmpty),
]

# Values used for any option the caller's settings do not provide.
default_options = {
    'IPYNB_REMOVE_EMPTY': True,
    'IPYNB_IGNORE': True,
    'IPYNB_SUBCELLS': True,
    'IPYNB_SUMMARY_CELL': False,
}
def config_pres(setting):
    """Configure the preprocessors from the given settings.

    Precedence: Metadata > SubCells > IgnoreTag = RemoveEmpty

    Builds the Markdown converter used by ``Metadata`` from
    ``setting['MARKDOWN']``, refreshes the options from ``setting``
    (falling back to ``default_options``), and returns the preprocessors
    to run: ``Metadata`` first, then each enabled entry of ``pres``.
    """
    Metadata.md = markdown.Markdown(**setting['MARKDOWN'])

    # Start from the defaults; a key present in `setting` overrides them.
    options = {
        name: (setting[name] if name in setting else fallback)
        for name, fallback in default_options.items()
    }
    Metadata.summary_cell = options['IPYNB_SUMMARY_CELL']

    enabled = [pre for opt, pre in pres if options[opt]]
    return [Metadata] + enabled