# ocannl_config.example -- example configuration file for OCANNL (ahrefs/ocannl).
#
# This file lists all the current configurations. It can be used as documentation
# or as a template for writing `ocannl_config` files. Config names must be prefixed
# by `ocannl_`, except in `ocannl_config` files where it is optional. The names are
# case-insensitive. The values are case-sensitive and should be lowercase where case
# is irrelevant. The configuration is read from several sources, in decreasing priority:
# 1. Commandline arguments.
# 2. Environment variables.
# 3. `ocannl_config` files.
# 4. Defaults hard-coded at use sites in the source code.
#
# Only one `ocannl_config` file is used per run, searched for in the current directory
# and in ancestor directories. The source of the `log_level` config is always printed;
# the sources of other configs are printed when the log level > 0.
# The configuration values below are (one of) the defaults.
#
# Repeating fields are disallowed. If the value is an empty string, the default value
# is used (as if the config was not present).

# Configurations that are stored as `Utils.settings`:

# The log level, for ppx_minidebug and with a few other uses in OCANNL.
log_level=1

# If `log_level` is at least 2 and this is true, the generated code will contain
# printf statements, whose output is then (typically) integrated into ppx_minidebug logs.
debug_log_from_routines=false

# If true, various intermediate representation files for the compiled code are generated
# (or not removed). Moreover, if log level is at least 2, the generated binaries will
# contain debug symbols for debugging with `gdb`, `cuda-gdb` etc.
output_debug_files_in_build_directory=false

# If true, when printing Low_level.t values in human-readable syntax (.ll files),
# tensor node names will include their precision annotations (e.g., `n3<single>` instead of `n3`).
# This is useful for debugging precision-related issues in generated code.
output_prec_in_ll_files=false

# If given, the integer seed to initialize the randomness library with.
fixed_state_for_init=

# For printing tensors, etc.
print_decimals_precision=2

# Complains if a half-precision tensor node is a constant with absolute value exceeding this.
check_half_prec_constants_cutoff=16384.0

# If true, [from_host] and [to_host] happen automatically in specific situations:
# - When a host array is about to be read, we transfer to host from the context that most
#   recently updated the node.
# - When a routine is about to be run, we transfer the routine's inputs from host to the
#   routine's context if the host array was not yet transferred since its creation or most
#   recent modification.
automatic_host_transfers=true

# If true, use uint64 for indexing arithmetic. If false (default), use uint32 for indexing arithmetic.
# This affects all backends' kernel index parameters and local index variables.
# Set to true for models with tensors larger than 2^32 elements.
big_models=false

# If true, the build_files directory is deleted on startup.
clean_up_build_files_on_startup=true

# If true, the log_files directory is deleted on startup.
clean_up_log_files_on_startup=true

# Other configurations:

# The default backend for computations, if `Backends.fresh_backend` isn't passed `backend_name`.
backend=multicore_cc

# If true, stdout capturing is disabled, so some logs meant for the ppx_minidebug log files
# (in particular CUDA logs) remain on the stdout.
#
# NOTE: current implementation of stdout capture loses information on channel
# buffering overflows, so it is important to verify with this setting if one gets
# sufficient information in the log files.
never_capture_stdout=false

# If `false`, debug logs from routines are re-logged via the ppx_minidebug framework.
# If `true`, the logs are in stream-specific files. WARNING: files are appended to, not overwritten.
debug_log_to_stream_files=false

# If set and relevant, it's the `CU_LIMIT_PRINTF_FIFO_SIZE` CUDA configuration.
cuda_printf_fifo_size=

# The `-O` argument to the compiler executable for the `multicore_cc` and `sync_cc` backends.
cc_backend_optimization_level=3

# The `multicore_cc` and `sync_cc` backends compiler executable name.
# By default, `ocamlc -config` field `c_compiler` is used.
cc_backend_compiler_command=

# Additional architecture-specific flags for the CC backend compiler invocation.
# Default `-march=native` targets the host CPU. Override for portability/cross-compilation.
# To disable arch-specific tuning from commandline, pass `--ocannl_cc_backend_arch_flags=`.
cc_backend_arch_flags=-march=native

# If true, enables `-ffast-math` for the CC backend. This may change numerical behavior.
cc_backend_fast_math=false

# If true, compiled dynamic libraries (.dll/.so files) for the CC backend are placed
# in the build_files directory using the source file path. If false, temporary files
# are used for the dynamic libraries (default behavior).
output_dlls_in_build_directory=false

# Per-tensor-node cutoff in bytes for considering off-stack allocation, opt-in for backends.
# Decrease it if you see crashes like `bus error`. Values less than or equal to 0 mean "no limit".
# Tensors exceeding this size will be allocated on the device's global memory.
stack_threshold_in_bytes=16384

# Only tensor nodes with up to this many visits per array cell (in a dedicated interpreter)
# can be inlined. Values worth considering: 0 (disables inlining) to 3.
virtualize_max_visits=1

# Truncate longer axes to this many dimensions in the generic optimizer's interpreter.
virtualize_max_tracing_dim=5

# If `true`, tensor nodes will by default not be hosted.
enable_device_only=true

# If true, scalar constant expressions will always be inlined.
inline_scalar_constexprs=true

# If true, if the tensor node is built from either single getters, or index embeddings and
# scalar constant expressions, regardless of accesses, it will be inlined.
inline_simple_computations=true

# If true, virtualize_max_visits only counts accesses that are not used for assignment of
# the same cell (typically accumulation). Otherwise, all accesses are counted, so computations
# that reduce an axis will rarely be inlined.
# FIXME(#351): avoid excessive inlining while CSE is not implemented
inline_complex_computations=false

# The random number library. Options: `stdlib` -- `Base.Random`;
# `for_tests` -- simplistic randomness with 32 bit seed, focused on reproducibility.
randomness_lib=stdlib

# Low-level-code identifier syntax. Options: heuristic, name_and_label, name_only.
ll_ident_style=heuristic

# For ppx_minidebug non-flushing static backends, when non-empty, enables snapshotting of ongoing
# logging into a file, with the given frequency.
snapshot_every_sec=

# Whether ppx_minidebug entries should be tagged by time information.
# Options: not_tagged, clock, elapsed (relative to start of the program).
time_tagged=elapsed

# Whether ppx_minidebug should print the time span of each entry, and in what units.
# Options: not_reported, seconds, milliseconds, microseconds, nanoseconds.
elapsed_times=not_reported

# For ppx_minidebug, how file locations should be presented. Options:
# no_location, file_only, beg_line, beg_pos, range_line, range_pos.
location_format=beg_pos

# The ppx_minidebug logging backend (i.e. format). Options: db, text, html, markdown, flushing.
# The db backend requires a rendering client from ppx_minidebug, the other backends
# generate static (rendered) artifacts.
debug_backend=db

# For ppx_minidebug: a prefix for file positions relative to the project root.
# A more elaborate example:
# hyperlink_prefix=vscode://file//wsl.localhost/ubuntu-24.04/home/lukstafi/ocannl/
hyperlink_prefix=./

# For ppx_minidebug: whether to print IDs for log entries.
logs_print_scope_ids=false

# For ppx_minidebug.
logs_verbose_scope_ids=false

# For ppx_minidebug, whether logging from the main domain, `Domain.is_main ()`,
# should be directed to stdout rather than to a file.
log_main_domain_to_stdout=false

# For ppx_minidebug Table of Contents.
toc_entry_minimal_depth=
toc_entry_minimal_size=
# The span is expressed in units: ns, us or ms.
toc_entry_minimal_span=

# For ppx_minidebug: `|`-separated list of terms to highlight in the logs.
debug_highlights=

# For ppx_minidebug: a pcre syntax regular expression to highlight in the logs.
debug_highlight_pcre=

# For ppx_minidebug: if provided, enables highlighting differences between the current run
# and a previous run loaded from `.raw` files derived using the given prefix.
prev_run_prefix=

# For ppx_minidebug: a pcre syntax regular expression pattern that will be removed from messages
# before comparison when highlighting differences between runs, allowing certain differences to be ignored.
diff_ignore_pattern_pcre=

# If not empty, should be a number giving the depth till which non-highlighted ppx_minidebug
# logs are pruned. Useful when enabling logging (e.g. shape inference) for non-toy examples.
debug_log_prune_upto=

# For ppx_minidebug: (minimal) width of the search band for the minimal edit distance algorithm.
diff_max_distance_factor=

# For ppx_minidebug: semicolon-separated list of comma-separated entry IDs from
# the previous and current run, that should be considered the same by the diffing algorithm.
debug_scope_id_pairs=

# For ppx_minidebug: for generating file names. If empty, all logging will be done to stdout,
# regardless of the value of `log_main_domain_to_stdout`.
log_file_stem=debug

# For ppx_minidebug: If given a number, sets the `truncate_children` parameter: only this number
# of the most recent children of each log entry are held.
debug_log_truncate_children=

# It is useful for testing to have outputs more uniform across backends even if that cripples
# some backends. Currently, this setting only affects logging from routines to accommodate Metal's
# shortcoming.
prefer_backend_uniformity=false

# The initial value for the default precisions for tensors. The default precisions for values and
# gradients can be changed separately via the `Tensor` API.
default_prec=single

# Limit on the allowed size of unrolled constant tensor nodes
# (where the initialization code sets the values).
limit_constant_fill_size=16

# The timeout for the CC backend to wait for the compilation files to appear, in seconds.
# This is the wait after the compiler command exits (doesn't include the compilation time).
cc_backend_post_compile_timeout=10.0

# If true, the CC backend will fail if the codesign of the compiled dynamic library fails.
cc_backend_verify_codesign=false

# The maximal number of entries for the provenance of constraints, as for example reported
# in shape error exceptions.
max_shape_error_origins=20