Merged. Changes from 3 commits.
estela-api/config/settings/base.py (8 changes: 4 additions & 4 deletions)
@@ -48,9 +48,9 @@
     GOOGLE_APPLICATION_LOCATION=(str, "dummy"),
     MAX_CLI_DOWNLOAD_CHUNK_MB=(int, 2),
     MAX_WEB_DOWNLOAD_SIZE_MB=(int, 1024),
-    MULTI_NODE_MODE=(str, "False"),
+    DEDICATED_SPIDER_NODES=(str, "True"),
     SPIDER_NODE_ROLE=(str, "bitmaker-worker"),
-    NODE_CAPACITY_THRESHOLD=(float, 0.95),
+    WORKERS_CAPACITY_THRESHOLD=(float, 0.95),
     DISPATCH_RETRY_DELAY=(int, 30),
     RUN_JOBS_PER_LOT=(int, 100),
     BUCKET_NAME_PROJECTS=(str, "dummy"),
@@ -251,9 +251,9 @@
 
 
 # Cluster settings
-MULTI_NODE_MODE = env("MULTI_NODE_MODE")
+DEDICATED_SPIDER_NODES = env("DEDICATED_SPIDER_NODES")
 SPIDER_NODE_ROLE = env("SPIDER_NODE_ROLE")
-NODE_CAPACITY_THRESHOLD = env("NODE_CAPACITY_THRESHOLD")
+WORKERS_CAPACITY_THRESHOLD = env("WORKERS_CAPACITY_THRESHOLD")
 DISPATCH_RETRY_DELAY = env("DISPATCH_RETRY_DELAY")
 RUN_JOBS_PER_LOT = env("RUN_JOBS_PER_LOT")
 CHECK_JOB_ERRORS_BATCH_SIZE = 100
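The declarations above follow django-environ's (cast, default) scheme, which is what makes the renamed settings typed. A minimal sketch of how the two renamed settings resolve, assuming the standard environ.Env API (not part of this diff):

import environ

# Each scheme entry is a (cast, default) pair: the environment variable is
# read as a string, converted with the cast, and falls back to the default
# when unset.
env = environ.Env(
    DEDICATED_SPIDER_NODES=(str, "True"),
    WORKERS_CAPACITY_THRESHOLD=(float, 0.95),
)

DEDICATED_SPIDER_NODES = env("DEDICATED_SPIDER_NODES")          # -> "True" (str)
WORKERS_CAPACITY_THRESHOLD = env("WORKERS_CAPACITY_THRESHOLD")  # -> 0.95 (float)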
estela-api/core/tasks.py (28 changes: 17 additions & 11 deletions)
@@ -31,7 +31,7 @@
 import redis
 from kubernetes import client, config
 
-NODE_CAPACITY_THRESHOLD = settings.NODE_CAPACITY_THRESHOLD
+WORKERS_CAPACITY_THRESHOLD = settings.WORKERS_CAPACITY_THRESHOLD
 
 def get_default_token(job):
     user = job.spider.project.users.first()

@@ -73,8 +73,8 @@ def run_spider_jobs():
         new_cpu = used_cpu + job_cpu
         new_mem = used_mem + job_mem
 
-        if (alloc_cpu > 0 and (new_cpu / alloc_cpu) >= NODE_CAPACITY_THRESHOLD) or \
-           (alloc_mem > 0 and (new_mem / alloc_mem) >= NODE_CAPACITY_THRESHOLD):
+        if (alloc_cpu > 0 and (new_cpu / alloc_cpu) >= WORKERS_CAPACITY_THRESHOLD) or \
+           (alloc_mem > 0 and (new_mem / alloc_mem) >= WORKERS_CAPACITY_THRESHOLD):
             skipped += 1
             continue
 
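The hunk above is the dispatcher's admission rule: a job is dispatched only if adding its requested CPU and memory keeps the workers below the capacity threshold (0.95 by default). A self-contained sketch of the same predicate; the function name and tuple layout are illustrative, not taken from the PR:

# Illustrative restatement of the admission check in run_spider_jobs().
def would_exceed_capacity(used, requested, allocatable, threshold=0.95):
    used_cpu, used_mem = used
    job_cpu, job_mem = requested
    alloc_cpu, alloc_mem = allocatable
    # Skip the job if either resource would reach the threshold fraction of
    # allocatable capacity (the alloc > 0 guards avoid division by zero).
    return (alloc_cpu > 0 and (used_cpu + job_cpu) / alloc_cpu >= threshold) or (
        alloc_mem > 0 and (used_mem + job_mem) / alloc_mem >= threshold
    )

# Example: 3.5 cores used, job wants 0.5, 4 allocatable -> 4.0/4.0 >= 0.95 -> skip.
assert would_exceed_capacity((3.5, 8.0), (0.5, 1.0), (4.0, 16.0)) is True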
Expand Down Expand Up @@ -119,9 +119,6 @@ def _dispatch_single_job(job):

token = get_default_token(job)

job.status = SpiderJob.WAITING_STATUS
job.save()

job_manager.create_job(
job.name,
job.key,
@@ -135,17 +132,25 @@
         resource_tier=job.resource_tier,
     )
 
+    job.status = SpiderJob.WAITING_STATUS
+    job.save()
+
 
 def _get_cluster_resources():
     try:
         config.load_incluster_config()
         v1 = client.CoreV1Api()
 
+        dedicated = settings.DEDICATED_SPIDER_NODES == "True"
         spider_node_role = settings.SPIDER_NODE_ROLE
-        nodes = v1.list_node(label_selector=f"role={spider_node_role}")
 
+        if dedicated:
+            nodes = v1.list_node(label_selector=f"role={spider_node_role}")
+        else:
+            nodes = v1.list_node()
+
         if not nodes.items:
-            logging.warning("No worker nodes found with label role=%s", spider_node_role)
+            logging.warning("No worker nodes found")
             return None
 
         total_allocatable_mem = 0
@@ -173,10 +178,11 @@ def _get_cluster_resources():
         )
         for pod in pending_pods.items:
             if pod.spec.node_name:
                 continue
-            node_selector = pod.spec.node_selector or {}
-            if node_selector.get("role") != spider_node_role:
-                continue
+            if dedicated:
+                node_selector = pod.spec.node_selector or {}
+                if node_selector.get("role") != spider_node_role:
+                    continue
             for container in pod.spec.containers:
                 requests = (container.resources.requests or {}) if container.resources else {}
                 total_requested_mem += _parse_k8s_resource(requests.get("memory", "0"))
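Two behavioral changes ride along here: the job is marked WAITING only after create_job succeeds (so a failed dispatch no longer leaves a stale WAITING record), and pending-pod accounting now applies on shared clusters too, filtering by the role node selector only in the dedicated case. The helper _parse_k8s_resource is referenced but not shown in this diff; a plausible sketch of such a Kubernetes quantity parser, illustrative only and not the PR's implementation:

# Hypothetical stand-in for _parse_k8s_resource (not shown in the PR).
# Converts Kubernetes quantity strings into plain numbers: CPU quantities
# resolve to cores ("500m" -> 0.5), memory quantities to bytes
# ("1Gi" -> 1073741824), and bare numbers pass through unchanged.
def _parse_k8s_resource(value: str) -> float:
    suffixes = {
        "Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
        "k": 1e3, "M": 1e6, "G": 1e9, "T": 1e12,
        "m": 1e-3,
    }
    # Check multi-character suffixes before single-character ones.
    for suffix in sorted(suffixes, key=len, reverse=True):
        if value.endswith(suffix):
            return float(value[: -len(suffix)]) * suffixes[suffix]
    return float(value)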
estela-api/engines/kubernetes.py (2 changes: 1 addition & 1 deletion)
@@ -116,7 +116,7 @@ def create_job_object(
             else ([volume] if volume else None)
         ),
         node_selector={"role": settings.SPIDER_NODE_ROLE}
-        if settings.MULTI_NODE_MODE == "True"
+        if settings.DEDICATED_SPIDER_NODES == "True"
         else None,
     )
     if not isbuild:
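The rename makes the scheduling constraint explicit: with DEDICATED_SPIDER_NODES set to "True", job pods carry a node selector and only schedule onto nodes labeled role=<SPIDER_NODE_ROLE>; otherwise the selector is omitted and they can land on any schedulable node. A minimal sketch of the resulting pod spec with the kubernetes Python client; the container name and image are placeholders:

from kubernetes import client

# Dedicated case: the selector pins job pods to labeled worker nodes.
# In the shared case, node_selector would simply be omitted (None).
pod_spec = client.V1PodSpec(
    containers=[client.V1Container(name="job", image="registry.example/spider:latest")],
    node_selector={"role": "bitmaker-worker"},  # matches nodes labeled role=bitmaker-worker
    restart_policy="Never",
)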
installation/helm-chart/templates/API/api-configmap.yaml (4 changes: 2 additions & 2 deletions)
@@ -30,9 +30,9 @@ data:
   DJANGO_API_HOST: http://{{ .Values.DJANGO_API_HOST }}
   DJANGO_ALLOWED_HOSTS: {{ .Values.DJANGO_API_HOST }},{{ .Values.DJANGO_ALLOWED_HOSTS }},127.0.0.1
   STAGE: {{ .Values.STAGE }}
-  MULTI_NODE_MODE: {{ .Values.MULTINODE | quote}}
+  DEDICATED_SPIDER_NODES: {{ .Values.DEDICATED_SPIDER_NODES | quote}}
   SPIDER_NODE_ROLE: {{ .Values.SPIDER_NODE_ROLE | quote }}
-  NODE_CAPACITY_THRESHOLD: {{ .Values.NODE_CAPACITY_THRESHOLD | quote }}
+  WORKERS_CAPACITY_THRESHOLD: {{ .Values.WORKERS_CAPACITY_THRESHOLD | quote }}
   DISPATCH_RETRY_DELAY: {{ .Values.DISPATCH_RETRY_DELAY | quote }}
   RUN_JOBS_PER_LOT: {{ .Values.RUN_JOBS_PER_LOT | quote }}
   BUILD: {{ .Values.BUILD }}
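These ConfigMap keys are what the API pods presumably consume as environment variables (via envFrom or equivalent), so the names here must stay in sync with the env(...) scheme in base.py. A value such as WORKERS_CAPACITY_THRESHOLD: "0.95" is rendered as a quoted string in the ConfigMap and only becomes a float once django-environ applies its declared cast.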
installation/helm-chart/values.yaml.example (8 changes: 7 additions & 1 deletion)
@@ -70,7 +70,7 @@ DJANGO_ALLOWED_HOSTS: ""
 DJANGO_EXTERNAL_APPS: "" # "app_1,app_2,..."
 EXTERNAL_APP_KEYS: "" # "key_1,key_2,..."
 EXTERNAL_MIDDLEWARES: "" # "app1.middlware,app2.middlware"
-MULTINODE: "" # "False"
+DEDICATED_SPIDER_NODES: "" # "True"
 BUILD: "" # "default"
 
 # Celery
@@ -107,6 +107,12 @@ SIZE_THRESHOLD: ""
 INSERT_TIME_THRESHOLD: ""
 ACTIVITY_TIME_THRESHOLD: ""
 
+############ SPIDER JOB RESOURCES #########
+SPIDER_NODE_ROLE: "bitmaker-worker"
+WORKERS_CAPACITY_THRESHOLD: "0.95"
+DISPATCH_RETRY_DELAY: "30"
+RUN_JOBS_PER_LOT: "100"
+
 ############ RESOURCES LIMITS #############
 API_CPU_LIMIT: "" # "250m"
 API_MEM_LIMIT: "" # "1Gi"
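The example values in the new SPIDER JOB RESOURCES section mirror the (cast, default) scheme in base.py ("bitmaker-worker", 0.95, 30, 100), so leaving them untouched changes no behavior. After editing values.yaml, the chart can be applied with the usual helm upgrade --install <release> installation/helm-chart -f values.yaml; note that any deployment still setting the old MULTINODE value must switch to the renamed DEDICATED_SPIDER_NODES key, since the template no longer reads .Values.MULTINODE.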