Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.

Commit da7798e

Browse files
johnugeorgek8s-ci-robot
authored andcommitted
Adding v1beta2 API implementation (#138)
* Adding v1beta2 API implementation * Build v1beta2
1 parent 8c9dc74 commit da7798e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+16309
-27
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ install:
1414

1515
script:
1616
- go build -o pytorch-operator.v1beta1 github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta1
17+
- go build -o pytorch-operator.v1beta2 github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta2
1718
- gometalinter --config=linter_config.json ./pkg/...
1819
# We customize the build step because by default
1920
# Travis runs go test -v ./... which will include the vendor

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
FROM debian:jessie
22

33
COPY pytorch-operator.v1beta1 /pytorch-operator.v1beta1
4+
COPY pytorch-operator.v1beta2 /pytorch-operator.v1beta2
45

56
ENTRYPOINT ["/pytorch-operator", "-alsologtostderr"]

Gopkg.lock

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

build_image.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ ln -s ${CONTEXT_DIR} ${GO_DIR}
2222
cd ${GO_DIR}
2323
echo "Build pytorch operator v1beta1 binary"
2424
go build github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta1
25+
echo "Build pytorch operator v1beta2 binary"
26+
go build github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta2
2527

2628
echo "Building container in gcloud"
2729
gcloud builds submit . --tag=${IMAGE}:${TAG}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright 2018 The Kubeflow Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package options
16+
17+
import (
18+
"flag"
19+
20+
"k8s.io/api/core/v1"
21+
)
22+
23+
// ServerOption is the main context object for the controller manager.
24+
type ServerOption struct {
25+
Kubeconfig string
26+
MasterURL string
27+
Threadiness int
28+
PrintVersion bool
29+
JSONLogFormat bool
30+
EnableGangScheduling bool
31+
Namespace string
32+
}
33+
34+
// NewServerOption creates a new CMServer with a default config.
35+
func NewServerOption() *ServerOption {
36+
s := ServerOption{}
37+
return &s
38+
}
39+
40+
// AddFlags adds flags for a specific CMServer to the specified FlagSet.
41+
func (s *ServerOption) AddFlags(fs *flag.FlagSet) {
42+
fs.StringVar(&s.MasterURL, "master", "",
43+
`The url of the Kubernetes API server,
44+
will overrides any value in kubeconfig, only required if out-of-cluster.`)
45+
46+
fs.StringVar(&s.Namespace, "namespace", v1.NamespaceAll,
47+
`The namespace to monitor pytorch jobs. If unset, it monitors all namespaces cluster-wide.
48+
If set, it only monitors pytorch jobs in the given namespace.`)
49+
50+
fs.IntVar(&s.Threadiness, "threadiness", 1,
51+
`How many threads to process the main logic`)
52+
53+
fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit")
54+
55+
fs.BoolVar(&s.JSONLogFormat, "json-log-format", true,
56+
"Set true to use json style log format. Set false to use plaintext style log format")
57+
fs.BoolVar(&s.EnableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling by kube-batch.")
58+
}
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
// Copyright 2018 The Kubeflow Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package app
16+
17+
import (
18+
"fmt"
19+
"os"
20+
"time"
21+
22+
log "github.com/sirupsen/logrus"
23+
"k8s.io/api/core/v1"
24+
crdclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
kubeinformers "k8s.io/client-go/informers"
27+
kubeclientset "k8s.io/client-go/kubernetes"
28+
restclientset "k8s.io/client-go/rest"
29+
"k8s.io/client-go/tools/clientcmd"
30+
election "k8s.io/client-go/tools/leaderelection"
31+
"k8s.io/client-go/tools/leaderelection/resourcelock"
32+
"k8s.io/client-go/tools/record"
33+
34+
"github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta2/app/options"
35+
"github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta2"
36+
jobclientset "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned"
37+
"github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme"
38+
jobinformers "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions"
39+
controller "github.com/kubeflow/pytorch-operator/pkg/controller.v1beta2/pytorch"
40+
"github.com/kubeflow/tf-operator/pkg/util/signals"
41+
"github.com/kubeflow/tf-operator/pkg/version"
42+
)
43+
44+
const (
45+
apiVersion = "v1beta2"
46+
)
47+
48+
var (
49+
// leader election config
50+
leaseDuration = 15 * time.Second
51+
renewDuration = 5 * time.Second
52+
retryPeriod = 3 * time.Second
53+
resyncPeriod = 30 * time.Second
54+
)
55+
56+
const RecommendedKubeConfigPathEnv = "KUBECONFIG"
57+
58+
func Run(opt *options.ServerOption) error {
59+
// Check if the -version flag was passed and, if so, print the version and exit.
60+
if opt.PrintVersion {
61+
version.PrintVersionAndExit(apiVersion)
62+
}
63+
64+
namespace := os.Getenv(v1beta2.EnvKubeflowNamespace)
65+
if len(namespace) == 0 {
66+
log.Infof("EnvKubeflowNamespace not set, use default namespace")
67+
namespace = metav1.NamespaceDefault
68+
}
69+
70+
// To help debugging, immediately log version.
71+
log.Infof("%+v", version.Info(apiVersion))
72+
73+
// Set up signals so we handle the first shutdown signal gracefully.
74+
stopCh := signals.SetupSignalHandler()
75+
76+
// Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option.
77+
if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 {
78+
// use the current context in kubeconfig
79+
// This is very useful for running locally.
80+
opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv)
81+
}
82+
83+
// Get kubernetes config.
84+
kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig)
85+
if err != nil {
86+
log.Fatalf("Error building kubeconfig: %s", err.Error())
87+
}
88+
89+
// Create clients.
90+
kubeClientSet, leaderElectionClientSet, pytorchJobClientSet, err := createClientSets(kcfg)
91+
if err != nil {
92+
return err
93+
}
94+
95+
// Create informer factory.
96+
kubeInformerFactory := kubeinformers.NewFilteredSharedInformerFactory(kubeClientSet, resyncPeriod, opt.Namespace, nil)
97+
pytorchJobInformerFactory := jobinformers.NewSharedInformerFactory(pytorchJobClientSet, resyncPeriod)
98+
99+
unstructuredInformer := controller.NewUnstructuredPyTorchJobInformer(kcfg, opt.Namespace)
100+
101+
// Create pytorch controller.
102+
tc := controller.NewPyTorchController(unstructuredInformer, kubeClientSet, pytorchJobClientSet, kubeInformerFactory, pytorchJobInformerFactory, *opt)
103+
104+
// Start informer goroutines.
105+
go kubeInformerFactory.Start(stopCh)
106+
107+
go unstructuredInformer.Informer().Run(stopCh)
108+
109+
// Set leader election start function.
110+
run := func(<-chan struct{}) {
111+
if err := tc.Run(opt.Threadiness, stopCh); err != nil {
112+
log.Errorf("Failed to run the controller: %v", err)
113+
}
114+
}
115+
116+
id, err := os.Hostname()
117+
if err != nil {
118+
return fmt.Errorf("failed to get hostname: %v", err)
119+
}
120+
121+
// Prepare event clients.
122+
eventBroadcaster := record.NewBroadcaster()
123+
if err = v1.AddToScheme(scheme.Scheme); err != nil {
124+
return fmt.Errorf("coreV1 Add Scheme failed: %v", err)
125+
}
126+
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "pytorch-operator"})
127+
128+
rl := &resourcelock.EndpointsLock{
129+
EndpointsMeta: metav1.ObjectMeta{
130+
Namespace: namespace,
131+
Name: "pytorch-operator",
132+
},
133+
Client: leaderElectionClientSet.CoreV1(),
134+
LockConfig: resourcelock.ResourceLockConfig{
135+
Identity: id,
136+
EventRecorder: recorder,
137+
},
138+
}
139+
140+
// Start leader election.
141+
election.RunOrDie(election.LeaderElectionConfig{
142+
Lock: rl,
143+
LeaseDuration: leaseDuration,
144+
RenewDeadline: renewDuration,
145+
RetryPeriod: retryPeriod,
146+
Callbacks: election.LeaderCallbacks{
147+
OnStartedLeading: run,
148+
OnStoppedLeading: func() {
149+
log.Fatalf("leader election lost")
150+
},
151+
},
152+
})
153+
154+
return nil
155+
}
156+
157+
func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, jobclientset.Interface, error) {
158+
159+
crdClient, err := crdclient.NewForConfig(config)
160+
161+
if err != nil {
162+
return nil, nil, nil, err
163+
}
164+
165+
checkCRDExists(crdClient, v1beta2.PytorchCRD)
166+
167+
kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "pytorch-operator"))
168+
if err != nil {
169+
return nil, nil, nil, err
170+
}
171+
172+
leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election"))
173+
if err != nil {
174+
return nil, nil, nil, err
175+
}
176+
177+
jobClientSet, err := jobclientset.NewForConfig(config)
178+
if err != nil {
179+
return nil, nil, nil, err
180+
}
181+
182+
return kubeClientSet, leaderElectionClientSet, jobClientSet, nil
183+
}
184+
185+
func checkCRDExists(clientset crdclient.Interface, crdName string) {
186+
_, err := clientset.ApiextensionsV1beta1().CustomResourceDefinitions().Get(crdName, metav1.GetOptions{})
187+
188+
if err != nil {
189+
log.Error(err)
190+
os.Exit(1)
191+
}
192+
}

cmd/pytorch-operator.v1beta2/main.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright 2018 The Kubeflow Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"flag"
19+
20+
"github.com/onrik/logrus/filename"
21+
log "github.com/sirupsen/logrus"
22+
23+
"github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta2/app"
24+
"github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta2/app/options"
25+
)
26+
27+
func init() {
28+
// Add filename as one of the fields of the structured log message.
29+
filenameHook := filename.NewHook()
30+
filenameHook.Field = "filename"
31+
log.AddHook(filenameHook)
32+
}
33+
34+
func main() {
35+
s := options.NewServerOption()
36+
s.AddFlags(flag.CommandLine)
37+
38+
flag.Parse()
39+
40+
if s.JSONLogFormat {
41+
// Output logs in a json format so that it can be parsed by services like Stackdriver.
42+
log.SetFormatter(&log.JSONFormatter{})
43+
}
44+
45+
if err := app.Run(s); err != nil {
46+
log.Fatalf("%v\n", err)
47+
}
48+
49+
}

hack/update-codegen.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,17 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge
3030
# instead of the $GOPATH directly. For normal projects this can be dropped.
3131
${CODEGEN_PKG}/generate-groups.sh "defaulter,deepcopy,client,informer,lister" \
3232
github.com/kubeflow/pytorch-operator/pkg/client github.com/kubeflow/pytorch-operator/pkg/apis \
33-
pytorch:v1beta1 \
33+
pytorch:v1beta1,v1beta2 \
3434
--go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt
3535

3636
echo "Generating defaulters for pytorch v1beta1"
3737
${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1 \
3838
-O zz_generated.defaults \
3939
--go-header-file ./hack/../hack/boilerplate/boilerplate.go.txt \
4040
--output-package github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1
41+
42+
echo "Generating defaulters for pytorch v1beta2"
43+
${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta2 \
44+
-O zz_generated.defaults \
45+
--go-header-file ./hack/../hack/boilerplate/boilerplate.go.txt \
46+
--output-package github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta2

0 commit comments

Comments
 (0)