@@ -55,60 +55,66 @@ spec:
                   FamilyName represents the model type, like llama2, which will be auto-injected
                   to the labels with the key of `llmaz.io/model-family-name`.
                 type: string
-              inferenceFlavors:
-                description: |-
-                  InferenceFlavors represents the accelerator requirements to serve the model.
-                  Flavors are fungible following the priority represented by the slice order.
-                items:
-                  description: |-
-                    Flavor defines the accelerator requirements for a model and the necessary parameters
-                    in autoscaling. Right now, it will be used in two places:
-                    - Pod scheduling with node selectors specified.
-                    - Cluster autoscaling with essential parameters provided.
-                  properties:
-                    name:
-                      description: Name represents the flavor name, which will be
-                        used in the model claim.
-                      type: string
-                    nodeSelector:
-                      additionalProperties:
-                        type: string
-                      description: |-
-                        NodeSelector represents the node candidates for Pod placements; if a node doesn't
-                        meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
-                        If nodeSelector is empty, it means every node is a candidate.
-                      type: object
-                    params:
-                      additionalProperties:
-                        type: string
-                      description: |-
-                        Params stores other useful parameters and will be consumed by the autoscaling components
-                        like cluster-autoscaler, Karpenter.
-                        E.g. when scaling up nodes with 8x Nvidia A100, the parameter can be injected with
-                        instance-type: p4d.24xlarge for AWS.
-                      type: object
-                    requests:
-                      additionalProperties:
-                        anyOf:
-                        - type: integer
-                        - type: string
-                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                        x-kubernetes-int-or-string: true
+              inferenceConfig:
+                description: InferenceConfig represents the inference configurations
+                  for the model.
+                properties:
+                  flavors:
+                    description: |-
+                      Flavors represents the accelerator requirements to serve the model.
+                      Flavors are fungible following the priority represented by the slice order.
+                    items:
                       description: |-
-                        Requests defines the required accelerators to serve the model for each replica,
-                        like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
-                        the resource requirements for each replica. This may change in the future.
-                        Not recommended to set the cpu and memory usage here:
-                        - if using playground, you can define the cpu/mem usage at backendConfig.
-                        - if using inference service, you can define the cpu/mem at the container resources.
-                        However, if you define the same accelerator requests at playground/service as well,
-                        the requests here will be covered.
+                        Flavor defines the accelerator requirements for a model and the necessary parameters
+                        in autoscaling. Right now, it will be used in two places:
+                        - Pod scheduling with node selectors specified.
+                        - Cluster autoscaling with essential parameters provided.
+                      properties:
+                        name:
+                          description: Name represents the flavor name, which will
+                            be used in the model claim.
+                          type: string
+                        nodeSelector:
+                          additionalProperties:
+                            type: string
+                          description: |-
+                            NodeSelector represents the node candidates for Pod placements; if a node doesn't
+                            meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
+                            If nodeSelector is empty, it means every node is a candidate.
+                          type: object
+                        params:
+                          additionalProperties:
+                            type: string
+                          description: |-
+                            Params stores other useful parameters and will be consumed by cluster-autoscaler / Karpenter
+                            for autoscaling or be defined as model parallelism parameters like TP or PP size.
+                            E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
+                            with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
+                            Preset parameters: TP, PP, INSTANCE-TYPE.
+                          type: object
+                        requests:
+                          additionalProperties:
+                            anyOf:
+                            - type: integer
+                            - type: string
+                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                            x-kubernetes-int-or-string: true
+                          description: |-
+                            Requests defines the required accelerators to serve the model for each replica,
+                            like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
+                            the resource requirements for each replica, usually equal to the TP size.
+                            Not recommended to set the cpu and memory usage here:
+                            - if using playground, you can define the cpu/mem usage at backendConfig.
+                            - if using inference service, you can define the cpu/mem at the container resources.
+                            However, if you define the same accelerator requests at playground/service as well,
+                            the requests will be overwritten by the flavor requests.
+                          type: object
+                      required:
+                      - name
                       type: object
-                  required:
-                  - name
-                  type: object
-                maxItems: 8
-                type: array
+                    maxItems: 8
+                    type: array
+                type: object
               source:
                 description: |-
                   Source represents the source of the model; there are several ways to load
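
To make the new schema concrete, here is a minimal sketch of a model manifest exercising `inferenceConfig.flavors`. The `apiVersion`, `kind`, object names, and the GPU node label are assumptions for illustration only; this diff defines just the `spec` schema, so only the field names under `spec` are taken from it.

```yaml
# Minimal sketch: apiVersion/kind, names, and the node label are assumed,
# not taken from this diff; the spec fields mirror the schema above.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: llama3-8b
spec:
  familyName: llama3          # injected as the llmaz.io/model-family-name label
  source:
    uri: ollama://llama3.3
  inferenceConfig:
    flavors:                  # fungible; earlier entries have higher priority
    - name: a100              # flavor name, referenced in the model claim
      nodeSelector:           # hypothetical GPU label; an empty selector means every node is a candidate
        nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
      params:
        TP: "8"               # preset tensor-parallelism parameter (see "Preset parameters" above)
        INSTANCE-TYPE: p4d.24xlarge   # consumed by cluster-autoscaler / Karpenter when scaling up
      requests:
        nvidia.com/gpu: 8     # per-replica accelerator request, usually equal to the TP size
    - name: v100              # fallback flavor if no a100 candidate node qualifies
      requests:
        nvidia.com/gpu: 8
```

Per the `requests` description above, accelerator requests declared in a flavor overwrite the same requests set at the playground/service level, so cpu/mem belong in backendConfig or the container resources instead.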
@@ -158,8 +164,10 @@ spec:
                     type: object
                 uri:
                   description: |-
-                    URI represents various kinds of model sources following the uri protocol, e.g.:
-                    - OSS: oss://<bucket>.<endpoint>/<path-to-your-model>
+                    URI represents various kinds of model sources following the uri protocol, protocol://<address>, e.g.
+                    - oss://<bucket>.<endpoint>/<path-to-your-model>
+                    - ollama://llama3.3
+                    - host://<path-to-your-model>
                   type: string
               type: object
             required:
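
For the `uri` field, a short sketch of the three addressing forms now documented; the angle-bracket values are the schema's own placeholders, not real endpoints:

```yaml
# Exactly one uri of the form protocol://<address>, chosen by where the model lives.
spec:
  source:
    uri: oss://<bucket>.<endpoint>/<path-to-your-model>   # object storage (OSS)
    # uri: ollama://llama3.3                              # pulled via the Ollama registry
    # uri: host://<path-to-your-model>                    # a path already present on the host
```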