Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
516 commits
Select commit Hold shift + click to select a range
497a649
Update common_validation.py
abhishek-sa1 Dec 2, 2025
2e105f5
Update validate_mapping_file.yml
abhishek-sa1 Dec 2, 2025
1b577a5
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
130cc74
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Milisha-Gupta Dec 2, 2025
471e623
Merge pull request #3761 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
f9e2f9d
defect fix for ldms
Milisha-Gupta Dec 2, 2025
f50865c
Update common_validation.py
Milisha-Gupta Dec 2, 2025
cdd579d
Update common_validation.py
Milisha-Gupta Dec 2, 2025
94d3186
Update common_validation.py
Milisha-Gupta Dec 2, 2025
33ddf32
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Milisha-Gupta Dec 2, 2025
5e8e565
epel update
abhishek-sa1 Dec 2, 2025
6e0d4a0
Merge pull request #3763 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
b7026d5
ldms defect fix
Milisha-Gupta Dec 2, 2025
39d19b3
Update en_us_validation_msg.py
Milisha-Gupta Dec 2, 2025
be372c6
Update common_validation.py
Milisha-Gupta Dec 2, 2025
b22fde3
check functional groups from mapping file
priti-parate Dec 2, 2025
3ad088c
fix for lint error
priti-parate Dec 2, 2025
d82c5bb
resolving pylint errors
priti-parate Dec 2, 2025
b622e1f
Merge pull request #3762 from Milisha-Gupta/defectfix
priti-parate Dec 2, 2025
c475f0a
fixed name issue
priti-parate Dec 2, 2025
482a9b1
ldms defect fix
Milisha-Gupta Dec 2, 2025
06db5c4
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Milisha-Gupta Dec 2, 2025
8834e27
updated service_k8s version in cloud-int
Katakam-Rakesh Dec 2, 2025
3dc5173
Update service_k8s.json
Milisha-Gupta Dec 2, 2025
6c124a5
Merge branch 'pub/k8s_telemetry' of github.com:Katakam-Rakesh/omnia i…
Katakam-Rakesh Dec 2, 2025
03d45a1
Update service_k8s.json
Milisha-Gupta Dec 2, 2025
3944aa2
Update main.yml
Milisha-Gupta Dec 2, 2025
6585cbf
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
jagadeeshnv Dec 2, 2025
4f842f6
Merge pull request #3760 from priti-parate/pub/k8s_telemetry
priti-parate Dec 2, 2025
1e5f8e9
updated code to add Realmemory to nodename info
Nagachandan-P Dec 2, 2025
fcc3692
ubuntu ldms image update
abhishek-sa1 Dec 2, 2025
89f7d65
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
e75115a
converting gb to mb and tested
Nagachandan-P Dec 2, 2025
2b43bd1
Update read_node_idrac.yml
snarthan Dec 2, 2025
efc4c7c
Merge pull request #3747 from Milisha-Gupta/pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
4ec6533
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 2, 2025
0693c0a
Merge pull request #3743 from Nagachandan-P/pub/k8s_telemetry
jagadeeshnv Dec 2, 2025
4531f67
Pip package installation cloudinit fix
Aditya-DP Dec 2, 2025
fd0a955
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 2, 2025
57fe9d3
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 2, 2025
b6b47f2
input validation of valid admin IPs in given range
priti-parate Dec 2, 2025
85c5271
fixing lint error
priti-parate Dec 2, 2025
bec66e5
ldms tag update
abhishek-sa1 Dec 3, 2025
235c6c1
build image update
abhishek-sa1 Dec 3, 2025
6c64bbf
base image update
abhishek-sa1 Dec 3, 2025
7e974e6
build image lint fix
abhishek-sa1 Dec 3, 2025
76cf12f
Merge pull request #3765 from Aditya-DP/pub/k8s_telemetry
priti-parate Dec 3, 2025
4f65168
real memory param
Nagachandan-P Dec 3, 2025
223295e
image update
abhishek-sa1 Dec 3, 2025
0bbfbfb
Updated with var read_node_idrac.yml
Nagachandan-P Dec 3, 2025
4a6e900
Merge pull request #3767 from Nagachandan-P/pub/k8s_telemetry
jagadeeshnv Dec 3, 2025
7208114
kubespray removal and checkmarx fixes
pullan1 Dec 3, 2025
cbfc5ac
Update values.yaml.j2
abhishek-sa1 Dec 3, 2025
df2075d
Merge pull request #3764 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 3, 2025
27395db
removed multus and whereabouts from service_k8s json
Katakam-Rakesh Dec 3, 2025
1981095
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Katakam-Rakesh Dec 3, 2025
353d7f5
Merge pull request #3756 from Katakam-Rakesh/pub/k8s_telemetry
priti-parate Dec 3, 2025
8754069
Added log when metadata is identical
pullan1 Dec 3, 2025
04e4fdb
removed inventory validation from utils
pullan1 Dec 3, 2025
1d6e905
corrected the imports
pullan1 Dec 3, 2025
d950091
Removed the unused task
pullan1 Dec 3, 2025
e8c414c
append errors instead of raining valuerror
priti-parate Dec 3, 2025
b914fab
discovery role readme update
abhishek-sa1 Dec 3, 2025
c19b015
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 3, 2025
7858e28
Update README.md
abhishek-sa1 Dec 3, 2025
17c23ab
Merge branch 'pub/k8s_telemetry' of https://github.com/abhishek-sa1/o…
abhishek-sa1 Dec 3, 2025
eb8bc3c
Update ci-group-service_kube_control_plane_x86_64.yaml.j2
Katakam-Rakesh Dec 3, 2025
5858e4e
Merge pull request #3768 from pullan1/pub/k8s_telemetry
snarthan Dec 3, 2025
fc4dce7
Merge pull request #3770 from Katakam-Rakesh/pub/k8s_telemetry
snarthan Dec 3, 2025
4f703f2
added crio-size and coredns restart
Katakam-Rakesh Dec 3, 2025
d80d2e6
update readme highlevel
abhishek-sa1 Dec 3, 2025
6211d4d
discovery doc update
abhishek-sa1 Dec 3, 2025
5595342
remove dependencies
abhishek-sa1 Dec 3, 2025
7ee90f2
removed unused variable
priti-parate Dec 3, 2025
0f6ae41
Merge pull request #3766 from priti-parate/pub/k8s_telemetry
priti-parate Dec 3, 2025
b5e7dbf
openldap security fix
balajikumaran-c-s Dec 3, 2025
3fbf4c8
Update kafka.tls_test_job.yaml.j2
abhishek-sa1 Dec 4, 2025
77aba81
Update main.yml
abhishek-sa1 Dec 4, 2025
91d40a8
Update main.yml
abhishek-sa1 Dec 4, 2025
dc7569e
Update main.yml
abhishek-sa1 Dec 4, 2025
8a1fcef
Merge pull request #3771 from balajikumaran-c-s/pub/k8s_telemetry
abhishek-sa1 Dec 4, 2025
99fcba2
Update README.md
abhishek-sa1 Dec 4, 2025
4712f48
update readme
abhishek-sa1 Dec 4, 2025
30e5e09
introduce polling mechanism for slurm controller
VrindaMarwah Dec 4, 2025
9a737a3
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
VrindaMarwah Dec 4, 2025
7cc0024
Merge pull request #3769 from abhishek-sa1/pub/k8s_telemetry
priti-parate Dec 4, 2025
5f4e360
introduce polling mechanism for slurm in login node and login compile…
VrindaMarwah Dec 4, 2025
09ea233
Merge branch 'pub/k8s_telemetry' of github.com:VrindaMarwah/omnia int…
VrindaMarwah Dec 4, 2025
646654a
modify install_cuda_toolkit path
VrindaMarwah Dec 4, 2025
84fa4f6
added arguments for kube-controller-manager
Katakam-Rakesh Dec 4, 2025
db66540
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Katakam-Rakesh Dec 4, 2025
f92b92c
Merge pull request #3772 from dell/pub/k8s_telemetry
abhishek-sa1 Dec 4, 2025
058e80b
Fix for Docker credential validation
pullan1 Dec 4, 2025
63e3179
moved import to top
pullan1 Dec 4, 2025
89a147d
added arguments for kube-controller-manager
Katakam-Rakesh Dec 4, 2025
78da47d
Merge branch 'pub/k8s_telemetry' of github.com:Katakam-Rakesh/omnia i…
Katakam-Rakesh Dec 4, 2025
1f748e9
Merge pull request #3774 from pullan1/pub/k8s_telemetry
snarthan Dec 4, 2025
4d42794
address pr comments
VrindaMarwah Dec 4, 2025
15c68e3
added arguments for kubelet config and coredns config map
Katakam-Rakesh Dec 5, 2025
237add6
fix cloud-init
Katakam-Rakesh Dec 5, 2025
5514ff9
updated cloud-int
Katakam-Rakesh Dec 5, 2025
a5aa022
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Katakam-Rakesh Dec 5, 2025
caef853
added msg for informing cloud-init completion
Katakam-Rakesh Dec 5, 2025
6d5671c
Merge branch 'pub/k8s_telemetry' of github.com:Katakam-Rakesh/omnia i…
Katakam-Rakesh Dec 5, 2025
7053c92
Updated the localrepo failure message
pullan1 Dec 5, 2025
c7eff98
Update omnia_config.yml
Katakam-Rakesh Dec 5, 2025
e0b722b
move polling script to templates dir
VrindaMarwah Dec 5, 2025
090adef
omnia core and auth 1.0 update
abhishek-sa1 Dec 5, 2025
2694291
Update deploy_auth_service.yml
abhishek-sa1 Dec 5, 2025
e5f0400
Enable idrac telemtry service for iDRAC IP's
nethramg Dec 5, 2025
83958c7
Update deploy_auth_service.yml
abhishek-sa1 Dec 5, 2025
4198053
Ansible lint fixes
nethramg Dec 5, 2025
49bbf28
auth container update
abhishek-sa1 Dec 5, 2025
40fae1c
Update main.yml
abhishek-sa1 Dec 5, 2025
f3b31e9
Update deploy_auth_service.yml
abhishek-sa1 Dec 5, 2025
b82ae3b
Merge pull request #3776 from Katakam-Rakesh/pub/k8s_telemetry
snarthan Dec 5, 2025
6619433
Update deploy_auth_service.yml
abhishek-sa1 Dec 5, 2025
11875d3
updated kube-proxy config map
Katakam-Rakesh Dec 5, 2025
77bc3ec
Merge branch 'pub/k8s_telemetry' of github.com:Katakam-Rakesh/omnia i…
Katakam-Rakesh Dec 5, 2025
516095e
Docker validation fix
pullan1 Dec 5, 2025
6a158ca
Merge pull request #3775 from pullan1/staging
snarthan Dec 5, 2025
5ae9b2e
Merge pull request #3779 from Katakam-Rakesh/pub/k8s_telemetry
snarthan Dec 5, 2025
0fbc9b8
Enable runtime image download for diskless nodes
balajikumaran-c-s Dec 5, 2025
584693a
Enable runtime image download for diskless nodes
balajikumaran-c-s Dec 5, 2025
267aa5a
Update omnia.sh
abhishek-sa1 Dec 5, 2025
42dc017
Few more changes
nethramg Dec 6, 2025
3744329
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
VrindaMarwah Dec 6, 2025
52d3aed
Remove timezone variable from input/provision_config.yml
priti-parate Dec 8, 2025
f8a9fbb
lint fix and adding thread safe measures
nethramg Dec 8, 2025
0900f2e
Adding the idrac ips variable corrected
nethramg Dec 8, 2025
2ea6375
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
nethramg Dec 8, 2025
96fe13a
remove timezone from schema
priti-parate Dec 8, 2025
62064ac
pylint fixes
nethramg Dec 8, 2025
41244b7
Merge pull request #3781 from priti-parate/pub/k8s_telemetry
abhishek-sa1 Dec 8, 2025
06f92ff
update the kubelet files
sakshi-singla-1735 Dec 8, 2025
8a62c3d
updating csi poll value
sakshi-singla-1735 Dec 8, 2025
0e6d6e5
removed csi values part
sakshi-singla-1735 Dec 8, 2025
b808a94
update kubelet
sakshi-singla-1735 Dec 8, 2025
7054326
updating csi poll value
sakshi-singla-1735 Dec 8, 2025
310e3e0
Merge pull request #3782 from sakshi-singla-1735/pub/k8s_telemetry
snarthan Dec 8, 2025
9683a35
Adding the login node details in pxe mapping file
nethramg Dec 8, 2025
ccaf8d2
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 8, 2025
c79404e
Merge pull request #3778 from nethramg/pub/k8s_telemetry
abhishek-sa1 Dec 8, 2025
eecd401
Timezone Fix
Milisha-Gupta Dec 8, 2025
ba9d3f9
Timezone Fix
Milisha-Gupta Dec 8, 2025
838536c
testing changes
Milisha-Gupta Dec 8, 2025
87cadeb
testing changes
Milisha-Gupta Dec 8, 2025
e285df2
testing changes
Milisha-Gupta Dec 8, 2025
f2edf2d
Update validate_oim_timezone.yml
Milisha-Gupta Dec 8, 2025
3aae5e8
Merge pull request #3783 from Milisha-Gupta/pub/k8s_telemetry
abhishek-sa1 Dec 8, 2025
4236181
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 8, 2025
359d39b
Updated with var read_node_idrac.yml
Nagachandan-P Dec 9, 2025
585bdfd
Merge pull request #3786 from Nagachandan-P/staging
snarthan Dec 9, 2025
61a46db
Merge pull request #3777 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 9, 2025
1a1c824
docker login issue fix
pullan1 Dec 9, 2025
43f6fcb
Merge pull request #3787 from pullan1/staging
snarthan Dec 10, 2025
187f53f
Merge pull request #3773 from VrindaMarwah/pub/k8s_telemetry
jagadeeshnv Dec 10, 2025
daab347
Remove timezone variable from input/provision_config.yml
priti-parate Dec 8, 2025
76bc55c
remove timezone from schema
priti-parate Dec 8, 2025
eeea102
Timezone Fix
Milisha-Gupta Dec 8, 2025
1e66912
Update validate_oim_timezone.yml
Milisha-Gupta Dec 8, 2025
2da643b
timezone
Milisha-Gupta Dec 10, 2025
97ebe48
Complete PR #3783: Fix timezone variable reference in all files
Milisha-Gupta Dec 10, 2025
28b48f1
Update omnia.sh
Milisha-Gupta Dec 10, 2025
3476b83
Merge pull request #3790 from Milisha-Gupta/staging
abhishek-sa1 Dec 10, 2025
d85bd97
Merge branch 'pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 10, 2025
9a49953
Update ci-group-slurm_control_node_x86_64.yaml.j2
balajikumaran-c-s Dec 10, 2025
ffe6ae0
Update ci-group-login_node_x86_64.yaml.j2
balajikumaran-c-s Dec 10, 2025
7f26e39
Update ci-group-login_compiler_node_x86_64.yaml.j2
balajikumaran-c-s Dec 10, 2025
127d1a2
Update ci-group-slurm_node_x86_64.yaml.j2
balajikumaran-c-s Dec 10, 2025
784c27c
updated variable for ucx and mpi mount
jagadeeshnv Dec 10, 2025
fa1f7e3
Update ci-group-service_kube_node_x86_64.yaml.j2
Katakam-Rakesh Dec 11, 2025
e383191
Update storage_config.json
jagadeeshnv Dec 11, 2025
a71b016
Merge pull request #3791 from jagadeeshnv/staging
snarthan Dec 11, 2025
3aa8d55
Merge pull request #3792 from Katakam-Rakesh/pub/k8s_telemetry
snarthan Dec 11, 2025
630a629
Merge pull request #3780 from balajikumaran-c-s/pub/k8s_telemetry
abhishek-sa1 Dec 11, 2025
483c874
Merge branch 'staging' into pub/k8s_telemetry
Katakam-Rakesh Dec 11, 2025
32623fe
Update process_parallel.py
Katakam-Rakesh Dec 11, 2025
e154148
Merge pull request #2 from Katakam-Rakesh/pub/k8s_telemetry
Katakam-Rakesh Dec 11, 2025
f1d5746
Update nfs_client.yml
jagadeeshnv Dec 11, 2025
37466aa
Merge pull request #3793 from Katakam-Rakesh/staging
abhishek-sa1 Dec 15, 2025
840c8be
localrepo defect fixes
pullan1 Dec 15, 2025
ae41c19
localrepo defect fixes
pullan1 Dec 15, 2025
1817294
Merge pull request #3795 from pullan1/staging
snarthan Dec 15, 2025
8b57e0c
updated localrepo failure message
pullan1 Dec 15, 2025
3bfe8b8
Merge pull request #3794 from pullan1/pub/k8s_telemetry
snarthan Dec 15, 2025
b939b98
add slurm support condition in slurm config tasks, modify logic for c…
VrindaMarwah Dec 15, 2025
b377143
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
VrindaMarwah Dec 15, 2025
a7c9d2f
lint fixes
VrindaMarwah Dec 15, 2025
ad14a42
Merge branch 'pub/k8s_telemetry' of github.com:VrindaMarwah/omnia int…
VrindaMarwah Dec 15, 2025
187f585
Merge pull request #3796 from VrindaMarwah/pub/k8s_telemetry
jagadeeshnv Dec 15, 2025
28813ad
add slurm support condition in slurm config tasks, modify logic for c…
VrindaMarwah Dec 15, 2025
a9eb1db
Merge pull request #3797 from VrindaMarwah/staging
jagadeeshnv Dec 15, 2025
6558d93
Pod failover timeout update
Aditya-DP Dec 15, 2025
d0ee6c2
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Aditya-DP Dec 15, 2025
be11e91
Update victoria-statefulset.yaml.j2
abhishek-sa1 Dec 15, 2025
f3c9ea5
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 15, 2025
f434c40
Revert "Update victoria-statefulset.yaml.j2"
abhishek-sa1 Dec 15, 2025
34cd22f
Remove cronjob for pod cleanup
Aditya-DP Dec 15, 2025
f475d1e
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 15, 2025
3802354
Remove golang image
Aditya-DP Dec 15, 2025
74b56a3
Merge branch 'staging' into pub/k8s_telemetry
abhishek-sa1 Dec 15, 2025
5c3f046
Merge pull request #3798 from Aditya-DP/pub/k8s_telemetry
abhishek-sa1 Dec 15, 2025
a628217
Merge pull request #3800 from Aditya-DP/pub/k8s_telemetry
abhishek-sa1 Dec 15, 2025
1c76c19
Mysql RWO config
Aditya-DP Dec 15, 2025
180ca8d
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 15, 2025
401da5f
Merge branch 'pub/k8s_telemetry' of https://github.com/abhishek-sa1/o…
abhishek-sa1 Dec 15, 2025
173ec18
Cleanup update
Aditya-DP Dec 15, 2025
012cd8c
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Aditya-DP Dec 15, 2025
0577bb7
Update idrac_telemetry_statefulset.yaml.j2
Aditya-DP Dec 15, 2025
d0504ef
Merge pull request #3801 from Aditya-DP/pub/k8s_telemetry
abhishek-sa1 Dec 15, 2025
0d1f961
Merge pull request #3802 from Aditya-DP/pub/k8s_telemetry
priti-parate Dec 15, 2025
7534347
mysql volume mutex update
Aditya-DP Dec 15, 2025
30c401c
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Aditya-DP Dec 15, 2025
6351c63
Merge pull request #3803 from Aditya-DP/pub/k8s_telemetry
priti-parate Dec 15, 2025
e9ced59
Merge pull request #3804 from Aditya-DP/pub/k8s_telemetry
priti-parate Dec 15, 2025
1511859
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
6c172d7
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 16, 2025
0e8b129
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
715148a
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
463a508
remove telemetry sampler input file
abhishek-sa1 Dec 16, 2025
fa5ecf9
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
abhishek-sa1 Dec 16, 2025
c6f6833
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
d7da93e
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
647860d
Merge pull request #3806 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 16, 2025
d413867
Telemetry lock cleanup update
Aditya-DP Dec 16, 2025
1532bcd
hostname mismatch
priti-parate Dec 16, 2025
54e0440
Merge pull request #3808 from priti-parate/pub/k8s_teleemtry_issue_fix
abhishek-sa1 Dec 16, 2025
11e910f
Merge pull request #3807 from priti-parate/pub/k8s_teleemtry_issue_fix
abhishek-sa1 Dec 17, 2025
5e55212
Merge pull request #3805 from abhishek-sa1/pub/k8s_telemetry
abhishek-sa1 Dec 17, 2025
96ab68e
Telemetry pod cleanup automation and lock file cleanup
Aditya-DP Dec 17, 2025
a295328
Merge branch 'dell:pub/k8s_telemetry' into pub/k8s_telemetry
Aditya-DP Dec 17, 2025
02d844e
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
52201d3
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 17, 2025
b0c0648
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
6258303
Merge branch 'pub/k8s_telemetry' of https://github.com/Aditya-DP/omni…
Aditya-DP Dec 17, 2025
0609916
Update main.yml
Katakam-Rakesh Dec 17, 2025
5e81637
Merge branch 'dell:staging' into staging
Katakam-Rakesh Dec 17, 2025
0c229dc
Merge pull request #3812 from Katakam-Rakesh/staging
jagadeeshnv Dec 17, 2025
09c8d57
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
85b4e96
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
8f20605
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
e927397
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
df429cf
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
09f9521
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
930a486
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
aea2e31
Update enable_telemetry_service.py
Milisha-Gupta Dec 17, 2025
d595dd4
Telemetry lock cleanup update
Aditya-DP Dec 17, 2025
11d7c1c
Merge pull request #3816 from Milisha-Gupta/staging
abhishek-sa1 Dec 17, 2025
018b75a
Merge pull request #3811 from Aditya-DP/pub/k8s_telemetry
abhishek-sa1 Dec 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .config/requirements.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,3 @@ collections:
version: 1.16.2
- name: community.postgresql
version: 3.10.2
- name: https://github.com/kubernetes-sigs/kubespray
type: git
version: v2.25.0
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
rescue:
- name: Fail the build if the base image build fails
ansible.builtin.fail:
msg: "{{ base_image_failure_msg }}"
msg: |
{{ base_image_failure_msg }}

always:
- name: Remove generated base image vars file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@
ansible-playbook {{ openchami_clone_path }}/dell/podman-quadlets/image.yaml \
-i {{ aarch64_inventory_file }} -v \
--extra-vars '@{{ openchami_dir }}/{{ item.key }}_compute_images.yaml' \
-e "minio_s3_username={{ minio_s3_username }}" \
-e "minio_s3_password={{ minio_s3_password }}" \
--tags compute_image -v | \
/usr/bin/tee '{{ openchami_log_dir }}/{{ item.key }}_compute_image.log'
async: 3600 # Set async timeout (e.g., 1 hour)
Expand All @@ -63,21 +61,65 @@

- name: Wait for all OpenCHAMI jobs to finish and remove generated compute images templates
block:
- name: Display image build jobs status
ansible.builtin.debug:
msg: "Waiting for image build: {{ item.item.key }} (Job ID: {{ item.ansible_job_id }})"
loop: "{{ compute_image_build_job.results }}"
loop_control:
label: "{{ item.item.key }}"

- name: Wait for all OpenCHAMI jobs to finish
ansible.builtin.async_status:
jid: "{{ item.ansible_job_id }}"
register: job_result
until: job_result.finished
no_log: true
retries: "{{ job_retry }}"
delay: "{{ job_delay }}"
loop: "{{ compute_image_build_job.results }}"
loop_control:
label: "{{ item.item.key }}"
label: "Building: {{ item.item.key }}"

rescue:
- name: Fail explicitly if job failed
- name: Identify failed image builds
ansible.builtin.set_fact:
failed_images: >
{{ job_result.results
| selectattr('failed', 'defined')
| selectattr('failed', 'equalto', true)
| map(attribute='item.item.key')
| list }}
when: job_result.results is defined

- name: Build failure message list
ansible.builtin.set_fact:
failure_msg_list:
- "aarch64 compute image build job did not complete successfully."
- "Check logs at {{ openchami_log_dir }} for respective functional group for more details."
- ""
- "Failed images:"

- name: Add failed image names to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + [' - ' + item] }}"
loop: "{{ failed_images | default(['Unknown - check all logs']) }}"

- name: Add log paths section to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + ['', 'Check logs at ' + openchami_log_dir + ' for details:'] }}"

- name: Add log file paths to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + [' - ' + openchami_log_dir + '/' + item + '_compute_image.log'] }}"
loop: "{{ failed_images | default([]) }}"

- name: Display aarch64 compute image build failure details
ansible.builtin.debug:
msg: "{{ failure_msg_list }}"

- name: Failed to build the aarch64 compute image
ansible.builtin.fail:
msg: "{{ compute_image_failure_msg }}"
msg: "aarch64 compute image build failed. See details above."

always:
- name: Remove generated compute images templates
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ group_name: "{{ item.key }}"
rhel_base_compute_mounts: {{ ochami_compute_mounts | join(' ') }}
image_build_name: {{ ochami_aarch64_image | join (' ') }}
rhel_base_compute_command_options: {{ ochami_base_command | join (' ') }}
minio_s3_username: "{{ minio_s3_username }}"
minio_s3_password: "{{ minio_s3_password }}"

rhel_repos:
{% set rhel_repo = rhel_aarch64_repos %}
Expand Down
10 changes: 7 additions & 3 deletions build_image_aarch64/roles/image_creation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dir_permissions_644: "0644"
dir_permissions_755: "0755"
openchami_dir: "/opt/omnia/openchami"
openchami_clone_path: /opt/omnia/openchami/deployment-recipes
job_retry: "60"
job_retry: "120"
job_delay: "30"
openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir"
ochami_mounts:
Expand All @@ -43,8 +43,12 @@ openchami_aarch64_base_image_log_path: "{{ openchami_log_dir }}/aarch64_base_ima
openchami_base_image_vars_template: "{{ role_path }}/templates/base_image_template.j2"
openchami_aarch64_base_image_vars_path: "/opt/omnia/openchami/aarch64_base_image_template.yaml"
aarch64_inventory_file: "/tmp/temp_ochami_inventory.ini"
base_image_failure_msg: "Base aarch64 image build job failed or timed out. Check logs at path {{ openchami_aarch64_base_image_log_path }} for details."
compute_image_failure_msg: "Compute aarch64 image build job did not completed successfully."
base_image_failure_msg: |
Base aarch64 image build job failed or timed out.
Check logs at path {{ openchami_aarch64_base_image_log_path }} for details.
compute_image_failure_msg: |
aarch64 compute image build job did not complete successfully.
Check logs at {{ openchami_log_dir }} for respective functional group for more details.

# Usage: build_compute_image.yml
openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
rescue:
- name: Fail the build if the base image build fails
ansible.builtin.fail:
msg: "{{ base_image_failure_msg }}"
msg: |
{{ base_image_failure_msg }}

always:
- name: Remove generated base image vars file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@
ansible-playbook {{ openchami_clone_path }}/dell/podman-quadlets/image.yaml \
-i {{ openchami_clone_path }}/dell/podman-quadlets/inventory -v \
--extra-vars '@{{ openchami_dir }}/{{ item.key }}_compute_images.yaml' \
-e "minio_s3_username={{ minio_s3_username }}" \
-e "minio_s3_password={{ minio_s3_password }}" \
--tags compute_image -v | \
/usr/bin/tee '{{ openchami_log_dir }}/{{ item.key }}_compute_image.log'
async: 3600 # Set async timeout (e.g., 1 hour)
Expand All @@ -55,21 +53,65 @@

- name: Wait for all OpenCHAMI jobs to finish and remove generated compute images templates
block:
- name: Display image build jobs status
ansible.builtin.debug:
msg: "Waiting for image build: {{ item.item.key }} (Job ID: {{ item.ansible_job_id }})"
loop: "{{ compute_image_build_job.results }}"
loop_control:
label: "{{ item.item.key }}"

- name: Wait for all OpenCHAMI jobs to finish
ansible.builtin.async_status:
jid: "{{ item.ansible_job_id }}"
register: job_result
until: job_result.finished
no_log: true
retries: "{{ job_retry }}"
delay: "{{ job_delay }}"
loop: "{{ compute_image_build_job.results }}"
loop_control:
label: "{{ item.item.key }}"
label: "Building: {{ item.item.key }}"

rescue:
- name: Fail explicitly if job failed
- name: Identify failed image builds
ansible.builtin.set_fact:
failed_images: >
{{ job_result.results
| selectattr('failed', 'defined')
| selectattr('failed', 'equalto', true)
| map(attribute='item.item.key')
| list }}
when: job_result.results is defined

- name: Build failure message list
ansible.builtin.set_fact:
failure_msg_list:
- "x86_64 compute image build job did not complete successfully."
- "Check logs at {{ openchami_log_dir }} for respective functional group for more details."
- ""
- "Failed images:"

- name: Add failed image names to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + [' - ' + item] }}"
loop: "{{ failed_images | default(['Unknown - check all logs']) }}"

- name: Add log paths section to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + ['', 'Check logs at ' + openchami_log_dir + ' for details:'] }}"

- name: Add log file paths to message
ansible.builtin.set_fact:
failure_msg_list: "{{ failure_msg_list + [' - ' + openchami_log_dir + '/' + item + '_compute_image.log'] }}"
loop: "{{ failed_images | default([]) }}"

- name: Display x86_64 compute image build failure details
ansible.builtin.debug:
msg: "{{ failure_msg_list }}"

- name: Failed to build the x86_64 compute image
ansible.builtin.fail:
msg: "{{ compute_image_failure_msg }}"
msg: "x86_64 compute image build failed. See details above."

always:
- name: Remove generated compute images templates
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ group_name: "{{ item.key }}"
rhel_base_compute_mounts: {{ ochami_compute_mounts | join(' ') }}
image_build_name: {{ ochami_x86_64_image | join (' ') }}
rhel_base_compute_command_options: {{ ochami_base_command | join (' ') }}
minio_s3_username: "{{ minio_s3_username }}"
minio_s3_password: "{{ minio_s3_password }}"

rhel_repos:
{% set rhel_repo = rhel_x86_64_repos %}
Expand Down
10 changes: 7 additions & 3 deletions build_image_x86_64/roles/image_creation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dir_permissions_644: "0644"
dir_permissions_755: "0755"
openchami_dir: "/opt/omnia/openchami"
openchami_clone_path: /opt/omnia/openchami/deployment-recipes
job_retry: "60"
job_retry: "120"
job_delay: "30"
openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir"
ochami_mounts:
Expand All @@ -42,8 +42,12 @@ openchami_log_dir: /opt/omnia/log/openchami
openchami_x86_64_base_image_log_path: "{{ openchami_log_dir }}/x86_64_base_image.log"
openchami_base_image_vars_template: "{{ role_path }}/templates/base_image_template.j2"
openchami_x86_64_base_image_vars_path: "/opt/omnia/openchami/x86_64_base_image_template.yaml"
base_image_failure_msg: "Base x86_64 image build job failed or timed out. Check logs at path {{ openchami_x86_64_base_image_log_path }} for details."
compute_image_failure_msg: "Compute x86_64 image build job did not completed successfully."
base_image_failure_msg: |
Base x86_64 image build job failed or timed out.
Check logs at path {{ openchami_x86_64_base_image_log_path }} for details.
compute_image_failure_msg: |
x86_64 compute image build job did not complete successfully.
Check logs at {{ openchami_log_dir }} for respective functional group for more details.

# build_compute_image.yml
openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2"
Expand Down
30 changes: 20 additions & 10 deletions common/library/module_utils/input_validation/common_utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,23 @@
"software_config": "software_config.json",
"storage_config": "storage_config.yml",
"telemetry_config": "telemetry_config.yml",
#"functional_groups_config": "functional_groups_config.yml",
"high_availability_config": "high_availability_config.yml"
# "additional_software": "additional_software.json"
}

# Tags and the files that will be run based off of it
input_file_inventory = {
"build_image": [files["provision_config"]],
"scheduler": [
files["software_config"],
#files['functional_groups_config'],

files["omnia_config"]
# files["high_availability_config"]
],
"provision": [
files["provision_config"],
files["network_spec"],
files["software_config"],
# files["functional_groups_config"]
# files["high_availability_config"]
],
"security": [
Expand All @@ -73,12 +72,10 @@
files["omnia_config"],
files["storage_config"],
files["high_availability_config"],
#files["functional_groups_config"]
],
"storage": [files["storage_config"]],
"prepare_oim": [
files["network_spec"],
#files["functional_groups_config"]
],
# "high_availability": [files["high_availability_config"]],
# "additional_software": [files["additional_software"]],
Expand All @@ -92,7 +89,6 @@
files["software_config"],
files["storage_config"],
files["high_availability_config"],
#files["functional_groups_config"]
],
}

Expand All @@ -102,10 +98,11 @@
"ofed": "24.10-1.1.4.0",
"beegfs": "7.4.5",
"intel_benchmarks": "2024.1.0",
"ucx": "1.15.0",
"openmpi": "4.1.6",
"csi_driver_powerscale": "v2.11.0",
"rocm": "6.3.1"
"ucx": "1.19.0",
"openmpi": "5.0.8",
"csi_driver_powerscale": "v2.15.0",
"rocm": "6.3.1",
"service_k8s": "1.34.1"
}

# All of the passwords fields
Expand Down Expand Up @@ -156,6 +153,19 @@

supported_telemetry_collection_type = ["victoria","kafka"]

FUNCTIONAL_GROUP_LAYER_MAP = {
"service_kube_control_plane_first_x86_64": "management",
"service_kube_control_plane_x86_64": "management",
"service_kube_node_x86_64": "management",
"login_node_x86_64": "management",
"login_node_aarch64": "management",
"login_compiler_node_x86_64": "management",
"login_compiler_node_aarch64": "management",
"slurm_control_node_x86_64": "management",
"slurm_node_x86_64": "compute",
"slurm_node_aarch64": "compute"
}

# used for security_config.yml validation
supported_ldap_connection_type = ["TLS","SLS"]
EMAIL_MAX_LENGTH = 320
Expand Down
Loading