-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathplaybook-windows.yml
More file actions
308 lines (279 loc) · 11.6 KB
/
playbook-windows.yml
File metadata and controls
308 lines (279 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# We need a separate playbook for Windows hosts.
# Controller preparation play: installs the GitHub CLI on localhost.
# The 'gh' CLI tool is required on the controller (localhost) because the Windows
# hosts fetch their reports back to this machine, and the 'gh release upload'
# task is delegated to run here, not on the Windows instances themselves.
- name: Prepare Controller (Install gh)
  hosts: localhost
  vars:
    # FORCE these variables to prevent "undefined" errors on minimal runners
    ansible_distribution: Ubuntu
    ansible_os_family: Debian
  # Fact gathering is required for role dependencies.
  gather_facts: true
  roles:
    - role: andrewrothstein.gh
      vars:
        # Quoted so the version is always handled as a string.
        gh_version: '2.36.0'
        gh_arch: amd64
        gh_os: linux
# Connects to every Windows host and gathers facts. The CrowdStrike Falcon
# installation itself is currently commented out at the end of the task list.
- name: (windows) Install CrowdStrike Falcon
  hosts: windows
  # Facts are gathered explicitly by a task once the connection is up.
  gather_facts: false
  strategy: free
  # The any_errors_fatal field doesn't work with free strategy.
  # If there are spurious failures in the PLAY, it is good to run
  # the playbook again by removing free strategy.
  any_errors_fatal: true
  vars:
    ansible_connection_timeout: 300
  tasks:
    - name: Establish connection to Windows host
      block:
        - name: Wait for connection to be available
          ansible.builtin.wait_for_connection:
            connect_timeout: 60
            timeout: 300
            delay: 15

        - name: Verify connection with ping
          ansible.windows.win_ping:
      rescue:
        # Drop the current connection before trying again with longer timeouts.
        - name: Reset connection on failure
          ansible.builtin.meta: reset_connection

        - name: Retry connection with extended timeout
          ansible.builtin.wait_for_connection:
            connect_timeout: 120
            timeout: 600
            delay: 30
          register: retry_conn
          until: retry_conn is succeeded
          # Up to 5 attempts, 60 seconds apart.
          retries: 5
          delay: 60

    - name: Gather facts for first time
      ansible.builtin.setup:

    # - name: Install crowdstrike falcon
    #   ansible.builtin.include_role:
    #     name: caos.ansible_roles.install_crowdstrike_falcon
    #   vars:
    #     falcon_client_id: "{{ lookup('env', 'CROWDSTRIKE_CLIENT_ID') }}"
    #     falcon_client_secret: "{{ lookup('env', 'CROWDSTRIKE_CLIENT_SECRET') }}"
    #     falcon_customer_id: "{{ lookup('env', 'CROWDSTRIKE_CUSTOMER_ID') }}"
    #     api_base_url: "https://api.laggar.gcw.crowdstrike.com"
    #     win_download_path: 'C:\Windows\Temp\falcon-sensor.exe'
# Provisions each Windows host (Infrastructure Agent, logging configuration,
# Fluent Bit package, New Relic output plugin, Node.js), runs the integration
# test suite on it asynchronously, fetches the XML report back to the
# controller and uploads it to the GitHub pre-release.
- name: Provision Windows test executor instances and execute tests
  hosts: windows
  # The any_errors_fatal field doesn't work with free strategy.
  # If there are spurious failures in the PLAY, it is good to run
  # the playbook again by removing free strategy.
  any_errors_fatal: true
  strategy: free
  vars:
    ansible_win_async_startup_timeout: 180
    ansible_connection_timeout: 300
    # Quoted so the version is always handled as a string.
    node_version: "21.6.0"
    # The following information is populated using the gathered inventory variables
    fluent_bit_package_name: "{{ tags.fb_package_name }}"
    pre_release_name: "{{ tags.pre_release_name }}"
    fb_version: "{{ tags.fb_version }}"
    os_distro: "{{ tags.os_distro }}"
    os_version: "{{ tags.os_version }}"
    arch: "{{ tags.arch }}"
    new_relic_api_key: "{{ lookup('ansible.builtin.env', 'NEW_RELIC_API_KEY') }}"
    new_relic_account_id: "{{ lookup('ansible.builtin.env', 'NEW_RELIC_ACCOUNT_ID') }}"
    new_relic_region: "{{ lookup('ansible.builtin.env', 'NEW_RELIC_REGION') }}"
    monitored_file: 'C:\Windows\Temp\tail-log-file-test'
    monitored_tcp_port: 5170
    monitored_windows_log_name_using_winevtlog: 'Application'
    monitored_windows_log_name_using_winlog: 'Application'
    test_suite_folder: 'C:\Windows\Temp\test-suite'
    test_suite_report_path: '{{ test_suite_folder }}\reports\tests\test-report.xml'
    test_reports_dir: /tmp/test-reports
    node_path: 'C:\Program Files\nodejs\'
    # nr_fb_output_plugin_version is deliberately NOT re-declared here. A play
    # var written as "{{ nr_fb_output_plugin_version }}" refers to itself and can
    # only be resolved when a higher-precedence source (e.g. extra vars) shadows
    # it. The value is set in versions/common.yml (nrFbOutputPluginVersion) and
    # can be overridden via extra vars (-e nr_fb_output_plugin_version=<version>).
  environment:
    NEW_RELIC_API_KEY: "{{ new_relic_api_key }}"
    NEW_RELIC_ACCOUNT_ID: "{{ new_relic_account_id }}"
    NEW_RELIC_REGION: "{{ new_relic_region }}"
  tasks:
    - name: Establish connection to Windows host
      block:
        - name: Wait for connection to be available
          ansible.builtin.wait_for_connection:
            delay: 15
            timeout: 300
            connect_timeout: 60

        - name: Verify connection
          ansible.windows.win_ping:
      rescue:
        - name: Reset connection on failure
          ansible.builtin.meta: reset_connection

        - name: Retry connection with extended timeout
          ansible.builtin.wait_for_connection:
            delay: 30
            timeout: 600
            connect_timeout: 120
          retries: 5
          delay: 60
          register: retry_conn
          until: retry_conn is succeeded

    - name: Install Infrastructure Agent
      ansible.builtin.include_role:
        name: newrelic.newrelic_install
        # Despite being the default, I want to emphasize that we do NOT want the role variables to be
        # exposed to the play. Otherwise, the "tags" variable specified below would overwrite the AWS "tags"
        # variable object (that comes from the inventory), which would cause that the `fluent_bit_package_name`
        # (or any variable depending on the AWS tags) would not be resolvable. Note that this option is only
        # available in "include_role", and not in "import_role".
        public: false
      vars:
        targets:
          - infrastructure
        tags:
          product: logging
          owning_team: logging
          project: fluent-bit-packaging-and-testing

    - name: Configure log forwarding
      ansible.builtin.include_role:
        name: create_logging_configs

    # Skipped when the pre-release name starts with "local-".
    - name: Install Fluent Bit package to be tested for this distro
      ansible.builtin.include_role:
        name: install_fluent_bit_from_gh_prerelease
      vars:
        fb_package_name: "{{ fluent_bit_package_name }}"
        gh_prerelease_tag: "{{ pre_release_name }}"
      when: pre_release_name is not regex('^local-.*')

    - name: Install NR Fluent Bit output
      ansible.builtin.include_role:
        name: install_nr_fluent_bit_output
      vars:
        # 'win64' maps to 'amd64'; any other arch value maps to '386'.
        plugin_arch: "{{ (arch == 'win64') | ternary('amd64','386') }}"
        plugin_version: "{{ nr_fb_output_plugin_version }}"

    - name: Install Node
      win_chocolatey:
        name: nodejs
        version: '{{ node_version }}'

    - name: Capture contents of PATH environment variable
      ansible.windows.win_shell: echo "$env:PATH"
      register: original_path

    - name: Include node paths into patched_path
      ansible.builtin.set_fact:
        # trim: the captured stdout may end with a line break, which would
        # otherwise become part of the last original PATH entry.
        patched_path: '{{ original_path.stdout | trim }};{{ node_path }}'

    - name: Copy test suite
      ansible.windows.win_copy:
        src: ../../integration-tests/test-suite
        dest: 'C:\Windows\Temp\'

    - name: Install test-suite dependencies
      ansible.windows.win_shell: 'npm i'
      args:
        chdir: '{{ test_suite_folder }}'
      environment:
        PATH: '{{ patched_path }}'

    # Must run before the test suite: that task uses `creates` on this folder.
    - name: Remove previous reports
      ansible.windows.win_file:
        path: '{{ test_suite_folder }}\reports'
        state: absent

    - name: Validate fb_version variable is set and valid
      ansible.builtin.assert:
        that:
          - fb_version is defined
          - fb_version | length > 0
          - fb_version is match('^[0-9]+\.[0-9]+\.[0-9]+(?:-[\w.-]+)?$')
        fail_msg: "fb_version must be set to a valid semantic version (e.g., 4.2.2 or 4.2.2-beta). Current value: {{ fb_version | default('UNDEFINED') }}"
        success_msg: "fb_version is valid: {{ fb_version }}"

    - name: Run test-suite (async with timeout)
      ansible.windows.win_shell: 'npm run test'
      args:
        chdir: '{{ test_suite_folder }}'
        creates: '{{ test_suite_folder }}\reports'
      async: 1800  # 30 minutes timeout
      poll: 0  # fire and forget, check status separately
      environment:
        PATH: '{{ patched_path }}'
        LOGGING_ENDPOINT: https://log-api.newrelic.com/log/v1
        NERD_GRAPH_URL: https://api.newrelic.com/graphql
        API_KEY: "{{ new_relic_api_key }}"
        ACCOUNT_ID: "{{ new_relic_account_id }}"
        MONITORED_FILE: '{{ monitored_file }}'
        MONITORED_TCP_PORT: "{{ monitored_tcp_port }}"
        # Each env var is fed from the variable with the matching suffix.
        MONITORED_WINDOWS_LOG_NAME_USING_WINLOG: "{{ monitored_windows_log_name_using_winlog }}"
        MONITORED_WINDOWS_LOG_NAME_USING_WINEVTLOG: "{{ monitored_windows_log_name_using_winevtlog }}"
        EXPECTED_FB_VERSION: "{{ fb_version }}"
      register: test_job

    # Robust async status check with connection recovery for Windows
    - name: Wait for test-suite to complete (with reconnection support)
      block:
        - name: Check async job status
          ansible.builtin.async_status:
            jid: "{{ test_job.ansible_job_id }}"
          register: test_result
          until: test_result.finished
          retries: 60  # retry up to 60 times
          delay: 30  # wait 30 seconds between retries (total ~30 min)
      rescue:
        - name: Attempt to re-establish connection after failure
          ansible.builtin.wait_for_connection:
            delay: 15
            timeout: 180
            connect_timeout: 60
          retries: 5
          delay: 30
          register: reconnect_attempt
          until: reconnect_attempt is succeeded

        - name: Reset connection
          ansible.builtin.meta: reset_connection

        - name: Verify connection is working
          ansible.windows.win_ping:
          # Explicit `until` so the retries are honoured on every
          # ansible-core version, like the other retry tasks in this file.
          register: reconnect_ping
          until: reconnect_ping is succeeded
          retries: 3
          delay: 10

        - name: Retry async status check after reconnection
          ansible.builtin.async_status:
            jid: "{{ test_job.ansible_job_id }}"
          register: test_result
          until: test_result.finished
          retries: 30
          delay: 30
          # Best effort: a failure here does not stop the play; the fetch
          # task further down still requires the report file to exist.
          ignore_errors: true

    - name: Log test result status
      ansible.builtin.debug:
        msg: "Test result for {{ inventory_hostname }}: finished={{ test_result.finished | default('unknown') }}"
      when: test_result is defined

    # Reconnect before fetch
    - name: Re-establish connection before fetching results
      block:
        - name: Wait for connection
          ansible.builtin.wait_for_connection:
            delay: 5
            timeout: 180
            connect_timeout: 60

        - name: Verify with ping
          ansible.windows.win_ping:
      rescue:
        - name: Reset connection on failure
          ansible.builtin.meta: reset_connection

        - name: Retry connection
          ansible.builtin.wait_for_connection:
            delay: 15
            timeout: 300
            connect_timeout: 120
          # Explicit `until` so the retries are honoured on every
          # ansible-core version, like the other retry tasks in this file.
          register: fetch_reconnect
          until: fetch_reconnect is succeeded
          retries: 5
          delay: 45

    - name: Fetch results
      ansible.builtin.fetch:
        flat: true  # behave like copy, we only want the file, not the whole path
        src: '{{ test_suite_report_path }}'
        dest: '{{ test_reports_dir }}/{{ fb_version }}_{{ os_distro }}_{{ os_version }}_{{ arch }}.xml'
      register: fetch_result
      retries: 5
      delay: 15
      until: fetch_result is succeeded

    # Runs on the controller: copies the fetched report to the name built from
    # the TEST_REPORT_NAME environment variable (its ".xml" suffix stripped).
    - name: Rename Windows report for merging (Unique per host)
      ansible.builtin.copy:
        src: "{{ test_reports_dir }}/{{ fb_version }}_{{ os_distro }}_{{ os_version }}_{{ arch }}.xml"
        dest: "{{ test_reports_dir }}/{{ lookup('env', 'TEST_REPORT_NAME') | regex_replace('\\.xml$', '') }}-windows-{{ os_version }}-{{ arch }}.xml"
      delegate_to: localhost

    - name: Upload Windows report to release
      ansible.builtin.command:
        # argv passes each element as exactly one argument, so a tag or path
        # containing whitespace is not split into several arguments.
        argv:
          - gh
          - release
          - upload
          - "{{ pre_release_name }}"
          - "{{ test_reports_dir }}/{{ lookup('env', 'TEST_REPORT_NAME') | regex_replace('\\.xml$', '') }}-windows-{{ os_version }}-{{ arch }}.xml"
          - '--clobber'
      delegate_to: localhost
      register: upload_result
      until: upload_result.rc == 0
      retries: 5
      delay: 10