-
Notifications
You must be signed in to change notification settings - Fork 50
/
Copy pathconstants.py
126 lines (95 loc) · 4.65 KB
/
constants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Constants shared across the PyAirbyte codebase."""
from __future__ import annotations
import os
from pathlib import Path
# Flip to True to turn on additional debug logging.
DEBUG_MODE: bool = False
AB_RAW_ID_COLUMN: str = "_airbyte_raw_id"
"""Name of the column holding a unique identifier for each row of source data.

Note: This column is interpreted slightly differently than in Airbyte Dv2 destinations.
There, the column points to a row in a separate 'raw' table. In PyAirbyte, it is simply a
unique identifier assigned to each record as it is received.

PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by the time
they were received. This makes it possible to determine and debug the order in which records
were received, even if the source provides records that are tied or received out of order
from the perspective of their `emitted_at` (`_airbyte_extracted_at`) timestamps.
"""
AB_EXTRACTED_AT_COLUMN: str = "_airbyte_extracted_at"
"""Name of the column holding the timestamp at which the record was extracted."""
AB_META_COLUMN: str = "_airbyte_meta"
"""Name of the column holding metadata about the record."""
AB_INTERNAL_COLUMNS: set[str] = {
    AB_RAW_ID_COLUMN,
    AB_EXTRACTED_AT_COLUMN,
    AB_META_COLUMN,
}
"""The set of column names reserved for PyAirbyte's internal use."""
DEFAULT_CACHE_SCHEMA_NAME: str = "airbyte_raw"
"""Schema name that caches use by default.

Individual caches may override this value with a different schema name.
"""
# An empty `AIRBYTE_CACHE_ROOT` is treated the same as an unset one. Otherwise `Path("")`
# would silently resolve to the current working directory itself rather than `.cache`.
DEFAULT_CACHE_ROOT: Path = (
    Path(os.environ["AIRBYTE_CACHE_ROOT"])
    if os.getenv("AIRBYTE_CACHE_ROOT")
    else Path() / ".cache"
)
"""Default cache root is `.cache` in the current working directory.

The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.
If the variable is unset or empty, the default location is used.

Overriding this can be useful if you always want to store cache files in a specific location.
For example, in ephemeral environments like Google Colab, you might want to store cache files in
your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.
"""
DEFAULT_ARROW_MAX_CHUNK_SIZE: int = 100_000
"""Default record count for each batch of an Arrow dataset."""
def _str_to_bool(value: str) -> bool:
    """Convert the string value of an environment variable to a boolean.

    Matching is case-insensitive and ignores leading and trailing whitespace. Empty (or
    whitespace-only) values and the values `0`, `false`, `f`, `no`, `n` and `off` are
    treated as `False`; any other value is treated as `True`.
    """
    # Strip before comparing so that values such as "false " or " 0" are not mistaken for
    # a truthy value. `value or ""` keeps falsy input (e.g. `None`) mapping to `False`.
    normalized = (value or "").strip().lower()
    return normalized not in {"", "0", "false", "f", "no", "n", "off"}
TEMP_DIR_OVERRIDE: Path | None = (
Path(os.environ["AIRBYTE_TEMP_DIR"]) if os.getenv("AIRBYTE_TEMP_DIR") else None
)
"""The directory to use for temporary files.
This value is read from the `AIRBYTE_TEMP_DIR` environment variable. If the variable is not set,
Tempfile will use the system's default temporary directory.
This can be useful if you want to store temporary files in a specific location (or) when you
need your temporary files to exist in user level directories, and not in system level
directories for permissions reasons.
"""
TEMP_FILE_CLEANUP: bool = _str_to_bool(os.environ.get("AIRBYTE_TEMP_FILE_CLEANUP", "true"))
"""Whether temporary files should be cleaned up after use.

The value comes from the `AIRBYTE_TEMP_FILE_CLEANUP` environment variable and defaults to
`True` when that variable is not set.
"""
AIRBYTE_OFFLINE_MODE: bool = _str_to_bool(os.environ.get("AIRBYTE_OFFLINE_MODE", "false"))
"""Enable or disable offline mode.

The value comes from the `AIRBYTE_OFFLINE_MODE` environment variable and defaults to `False`
when that variable is not set.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the
Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in
environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a `DO_NOT_TRACK` setting, ensuring no usage data
is sent from your environment. You may also specify a custom registry URL via the
`_REGISTRY_ENV_VAR` environment variable if you prefer to use a different registry source for
metadata.

This setting helps you make informed choices about data privacy and operation in restricted and
air-gapped environments.
"""
AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = _str_to_bool(
    os.environ.get(
        "AIRBYTE_PRINT_FULL_ERROR_LOGS",
        # Fall back to the `CI` variable so CI runs get full logs unless told otherwise.
        os.environ.get("CI", "false"),
    )
)
"""Whether to print full error logs when an error occurs.

Printing the full logs helps with debugging. This is especially helpful in ephemeral
environments like CI/CD pipelines, where log files may not be persisted after the pipeline run.

The value comes from the `AIRBYTE_PRINT_FULL_ERROR_LOGS` environment variable. If it is not set,
the value of the `CI` environment variable is parsed as a boolean instead, so the default is
`True` when `CI` holds a truthy value and `False` otherwise.
"""