16
16
import contextlib
17
17
import logging
18
18
import os
19
+ import threading
19
20
import time
20
21
from typing import Sequence
21
22
22
23
from opentelemetry .exporter .otlp .proto .common ._internal import trace_encoder
23
24
from opentelemetry .exporter .otlp .proto .http .trace_exporter import OTLPSpanExporter
25
+ from opentelemetry .exporter .zipkin .json import ZipkinExporter # type: ignore
24
26
from opentelemetry .instrumentation .urllib import URLLibInstrumentor
25
27
from opentelemetry .sdk .resources import Resource
26
28
from opentelemetry .sdk .trace import ReadableSpan , TracerProvider
35
37
# Trace `urllib` usage when talking to Pebble
36
38
URLLibInstrumentor ().instrument ()
37
39
38
- _OTLP_SPAN_EXPORTER_TIMEOUT = 1 # seconds
40
+ # NOTE: nominally int, although float would work just as well in practice
41
+ EXPORTER_TIMEOUT : int = 1 # seconds
39
42
"""How much time the OTLP span exporter has to push traces to the backend."""
40
43
41
- SENDOUT_FACTOR = 2
42
- """How much buffered chunks to send out for each incoming chunk."""
44
+ SENDOUT_FACTOR : int = 2
45
+ """How many buffered chunks to send out for each incoming chunk."""
43
46
44
- # FIXME: this creates a separate file next to the CHARM_STATE_FILE
45
- # We could stuff both kinds of data into the same file, I guess?
46
- BUFFER_FILE = '.tracing-data.db'
47
- # Currently ops.storage keeps one long transaction open for the duration of the
48
- # the dispatch, which means we can't use the same file from another thread.
49
- # BUFFER_FILE = '.unit-state.db'
47
+ BUFFER_FILE : str = '.tracing-data.db'
48
+ """Name of the file whither data is buffered, located next to .unit-state.db."""
50
49
51
50
51
+ logger = logging .getLogger (__name__ )
52
52
_exporter : ProxySpanExporter | None = None
53
53
54
54
55
+ # NOTE: OTEL SDK suppresses errors while exporting data
56
+ # TODO: decide if we need to remove this before going to prod
57
+ logger .addHandler (logging .StreamHandler ())
58
+
59
+
55
60
class ProxySpanExporter (SpanExporter ):
56
61
real_exporter : OTLPSpanExporter | None = None
62
+ zipkin_exporter : ZipkinExporter | None = None
63
+ settings : tuple [str | None , str | None ] = (None , None )
57
64
58
65
def __init__ (self , buffer_path : str ):
59
66
self .buffer = ops ._tracing .buffer .Buffer (buffer_path )
67
+ self .lock = threading .Lock ()
60
68
61
69
def export (self , spans : Sequence [ReadableSpan ]) -> SpanExportResult :
62
70
"""Export a batch of telemetry data.
63
71
64
72
Note: to avoid data loops or recursion, this function cannot be instrumented.
65
73
"""
66
- with suppress_juju_log_handler ():
67
- # Note:
68
- # this is called in a helper thread, which is daemonic,
69
- # the MainThread will wait at most 10s for this thread.
70
- # Margins:
71
- # - 1s safety margin
72
- # - 1s for buffered data time overhang
73
- # - 2s for live data
74
- deadline = time .monotonic () + 6
75
-
76
- assert spans # the BatchSpanProcessor won't call us if there's no data
77
- # TODO: this will change in the JSON experiment
78
- data : bytes = trace_encoder .encode_spans (spans ).SerializePartialToString ()
79
- rv = self .buffer .pump (data )
80
- assert rv
81
- self .do_export (* rv )
82
-
83
- for _ in range (SENDOUT_FACTOR - 1 ):
84
- if time .monotonic () > deadline :
85
- break
86
- if not (rv := self .buffer .pump ()):
87
- break
74
+ try :
75
+ with suppress_juju_log_handler ():
76
+ # Note:
77
+ # this is called in a helper thread, which is daemonic,
78
+ # the MainThread will wait at most 10s for this thread.
79
+ # Margins:
80
+ # - 1s safety margin
81
+ # - 1s for buffered data time overhang
82
+ # - 2s for live data
83
+ deadline = time .monotonic () + 6
84
+
85
+ assert spans # the BatchSpanProcessor won't call us if there's no data
86
+ # TODO: this will change in the JSON experiment
87
+ data : bytes = trace_encoder .encode_spans (spans ).SerializePartialToString ()
88
+ jsons = [s .to_json (indent = None ) for s in spans ] # type: ignore
89
+ f'{{"resourceSpans": [{ "," .join (jsons )} ]}}'
90
+ rv = self .buffer .pump (data )
91
+ assert rv
88
92
self .do_export (* rv )
89
93
90
- return SpanExportResult .SUCCESS
94
+ for _ in range (SENDOUT_FACTOR - 1 ):
95
+ if time .monotonic () > deadline :
96
+ break
97
+ if not (rv := self .buffer .pump ()):
98
+ break
99
+ self .do_export (* rv )
100
+
101
+ url , ca = self .settings
102
+ assert url
103
+ url = url .replace ('4318' , '4317' )
104
+ print (url , ca )
105
+ # rv = requests.post(
106
+ # url,
107
+ # data=json_payload,
108
+ # headers= {"Content-Type": "application/json"}, verify=ca,)
109
+ # print(rv)
110
+ assert self .zipkin_exporter
111
+ print (self .zipkin_exporter .export (spans ))
112
+ return SpanExportResult .SUCCESS
113
+ except Exception :
114
+ logger .exception ('export' )
115
+ raise
91
116
92
117
def do_export (self , buffered_id : int , data : bytes ) -> None :
93
118
"""Export buffered data and remove it from the buffer on success."""
94
119
# TODO: this will change in the JSON experiment
95
- if self .real_exporter and self .real_exporter ._export (data ).ok :
120
+ exporter = self .real_exporter
121
+ return
122
+ if exporter and exporter ._export (data ).ok :
96
123
self .buffer .remove (buffered_id )
97
124
98
125
def shutdown (self ) -> None :
99
126
"""Shut down the exporter."""
100
- if self .real_exporter :
101
- self . real_exporter .shutdown ()
127
+ if exporter := self .real_exporter :
128
+ exporter .shutdown ()
102
129
103
130
def force_flush (self , timeout_millis : int = 30000 ) -> bool :
104
131
"""No-op, as the real exporter doesn't buffer."""
105
132
return True
106
133
107
- def set_real_exporter (self , exporter : OTLPSpanExporter ) -> None :
108
- self .real_exporter = exporter
109
-
110
134
111
135
@contextlib .contextmanager
112
136
def suppress_juju_log_handler ():
@@ -132,6 +156,16 @@ def setup_tracing(charm_class_name: str) -> None:
132
156
133
157
resource = Resource .create (
134
158
attributes = {
159
+ # https://opentelemetry.io/docs/languages/sdk-configuration/general/
160
+ # https://github.com/open-telemetry/semantic-conventions/tree/main/docs/resource#semantic-attributes-with-dedicated-environment-variable
161
+ #
162
+ # OTEL defines some standard-ish attributes:
163
+ # service.name required
164
+ # service.instance.id recommended
165
+ # service.namespace recommended -- maybe model name?
166
+ # service.version recommended
167
+ # Following same attribute names as charm_tracing lib
168
+ # FIXME: decide if it makes sense
135
169
'service.name' : service_name ,
136
170
'compose_service' : service_name , # FIXME why is this copy needed?
137
171
'charm_type' : charm_class_name ,
@@ -153,33 +187,50 @@ def setup_tracing(charm_class_name: str) -> None:
153
187
set_tracer_provider (provider )
154
188
155
189
156
- # FIXME make it very cheap to call this method a second time with same arguments
157
190
def set_tracing_destination (
158
191
* ,
159
192
url : str | None ,
160
193
ca : str | None ,
161
194
) -> None :
162
- # FIXME needs a threading.Lock
163
- # or access to underlying BatchXXX lock
164
- #
165
- # - check if settings are exactly same, do nothing in that case
166
- # - replace current exported with a new exporter
195
+ # FIXME only if it's a path, obv...
196
+ # should we also check that this path exists?
167
197
if ca is not None and not ca .startswith ('/' ):
168
198
raise ValueError (f'{ ca = } must be an absolute path' )
169
- assert _exporter
170
-
171
- # real exporter, hardcoded for now
172
- real_exporter = OTLPSpanExporter (url , timeout = 1 )
173
- # This is actually the max delay value in the sequence 1, 2, ..., MAX
174
- # Set to 1 to disable sending live data (buffered data is still eventually sent)
175
- # Set to 2 (or more) to enable sending live data (after buffered)
176
- #
177
- # _MAX_RETRY_TIMEOUT = 2 with timeout=1 means:
178
- # - 1 attempt to send live, 1s sleep in the worst case
179
- # _MAX_RETRY_TIMEOUT = 3 or 4 with timeout=1 means:
180
- # - 1st attempt, 1s sleep, 2nd attempt, 1s sleep in the worst case
181
- real_exporter ._MAX_RETRY_TIMEOUT = 2 # pyright: ignore[reportAttributeAccessIssue]
182
- _exporter .set_real_exporter (real_exporter )
199
+
200
+ assert _exporter , 'tracing has not been set up'
201
+ with _exporter .lock :
202
+ if (url , ca ) != _exporter .settings :
203
+ if url :
204
+ # real exporter, hardcoded for now
205
+ real_exporter = OTLPSpanExporter (url , timeout = EXPORTER_TIMEOUT )
206
+ # FIXME: shouldn't be hardcoded...
207
+ # FIXME API design: is it OK to force the protocol and endpoint
208
+ # switch onto the charmers, our users?
209
+ #
210
+ # OTLP protobuf URL is host:4318/v1/traces
211
+ # Zipkin v2 JSON URL is host:9411/api/v2/spans
212
+ #
213
+ json_url = 'http://localhost:9411/api/v2/spans'
214
+ # TODO: session=<custom session that groks ca= better>
215
+ zipkin_exporter = ZipkinExporter (
216
+ endpoint = json_url , timeout = EXPORTER_TIMEOUT
217
+ ) # FIXME timeout, etc
218
+ # This is actually the max delay value in the sequence 1, 2, ..., MAX
219
+ # Set to 1 to disable sending live data (buffered data is still eventually sent)
220
+ # Set to 2 (or more) to enable sending live data (after buffered)
221
+ #
222
+ # _MAX_RETRY_TIMEOUT = 2 with timeout=1 means:
223
+ # - 1 attempt to send live, 1s sleep in the worst case
224
+ # _MAX_RETRY_TIMEOUT = 3 or 4 with timeout=1 means:
225
+ # - 1st attempt, 1s sleep, 2nd attempt, 1s sleep in the worst case
226
+ real_exporter ._MAX_RETRY_TIMEOUT = 2 # pyright: ignore[reportAttributeAccessIssue]
227
+ else :
228
+ real_exporter = zipkin_exporter = None
229
+
230
+ _exporter .real_exporter = real_exporter
231
+ _exporter .zipkin_exporter = zipkin_exporter
232
+ _exporter .settings = (url , ca )
233
+
183
234
_exporter .buffer .mark_observed ()
184
235
185
236
0 commit comments