import sys, os

# Docker-image / output-file tag suffix; main() switches it to "_1_4" when
# the use_torch_1_4 environment variable requests the pinned torch 1.4 image.
suffix = ""

import jittor as jt
import time
from pathlib import Path
home_path = str(Path.home())
# Host-side cache directory; bind-mounted into the benchmark container as
# /root/.cache/jittor by test_main().
perf_path = os.path.join(home_path, ".cache", "jittor_perf")

def main():
    """Build the benchmark docker image, then sweep jittor vs. torch over a
    set of models and batch sizes, writing per-run FPS and jittor/torch
    ratios to log files under perf_path.

    Environment:
        use_torch_1_4=1 -- pin torch==1.4.0 / torchvision==0.5.0 inside the
            image and tag the image and output files with a "_1_4" suffix.

    Requires sudo rights for nvidia-smi and docker; asserts on failure.
    """
    # Stage jittor + jittor_utils sources into the cache dir; the container
    # runs perf.py from this bind-mounted copy (see test_main's PYTHONPATH).
    os.makedirs(perf_path+"/src/jittor", exist_ok=True)
    os.makedirs(perf_path+"/src/jittor_utils", exist_ok=True)
    os.system(f"cp -rL {jt.flags.jittor_path} {perf_path+'/src/'}")
    os.system(f"cp -rL {jt.flags.jittor_path}/../jittor_utils {perf_path+'/src/'}")
    use_torch_1_4 = os.environ.get("use_torch_1_4", "0") == "1"
    dockerfile_src = r"""
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04

RUN echo \
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list

# RUN rm -rf /var/lib/apt/lists/*
RUN apt update || true

RUN apt install wget \
        python3.7 python3.7-dev \
        g++ build-essential -y

WORKDIR /usr/src

RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
    && wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7

# change tsinghua mirror
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

RUN pip3 install \
        pybind11 \
        numpy \
        tqdm \
        pillow \
        astunparse

RUN pip3 install torch torchvision
"""
    global suffix
    if use_torch_1_4:
        suffix = "_1_4"
        # trailing space is significant: only matches the bare "torch " package
        dockerfile_src = dockerfile_src.replace("torch ", "torch==1.4.0 ")
        dockerfile_src = dockerfile_src.replace("torchvision", "torchvision==0.5.0")
    with open("/tmp/perf_dockerfile", 'w') as f:
        f.write(dockerfile_src)
    # lock GPU clocks so timings are comparable across runs
    assert os.system("sudo nvidia-smi -lgc 1500") == 0
    assert os.system(f"sudo docker build --tag jittor/jittor-perf{suffix} -f /tmp/perf_dockerfile .") == 0
    # run once for compile source
    jt_fps = test_main("jittor", "resnet50", 1)

    logs = ""
    # resnext50_32x4d with bs=8 cannot pass this test
    #### inference test
    for model_name in ["resnet50", "wide_resnet50_2", # "resnext50_32x4d",
                       "resnet152", "wide_resnet101_2", "resnext101_32x8d",
                       "alexnet", "vgg11", "squeezenet1_1", "mobilenet_v2",
                       "densenet121", "densenet169", "densenet201",
                       "res2net50", "res2net101"]:
        for bs in [1, 2, 4, 8, 16, 32, 64, 128]:
            jt_fps = test_main("jittor", model_name, bs)
            logs += f"jittor-{model_name}-{bs} {jt_fps}\n"
            tc_fps = test_main("torch", model_name, bs)
            logs += f"torch-{model_name}-{bs} {tc_fps}\n"
            # ratio > 1 means jittor is faster for this model/batch size
            logs += f"compare-{model_name}-{bs} {jt_fps/tc_fps}\n"
            print(logs)
    #### train test
    for model_name in ["train_resnet50", "train_resnet101"
                       ]:
        for bs in [1, 2, 4, 8, 16, 32, 64, 128]:
            jt_fps = test_main("jittor", model_name, bs)
            logs += f"jittor-{model_name}-{bs} {jt_fps}\n"
            tc_fps = test_main("torch", model_name, bs)
            logs += f"torch-{model_name}-{bs} {tc_fps}\n"
            logs += f"compare-{model_name}-{bs} {jt_fps/tc_fps}\n"
            print(logs)
    with open(f"{perf_path}/jittor-perf{suffix}-latest.txt", "w") as f:
        f.write(logs)
    from datetime import datetime
    # also keep a timestamped copy for history
    with open(f"{perf_path}/jittor-perf{suffix}-{datetime.now()}.txt", "w") as f:
        f.write(logs)
92+
def test_main(name, model_name, bs):
    """Run one benchmark inside the docker image and return its FPS.

    Launches the container with the cache dir bind-mounted, which makes the
    container execute test() below via `perf.py <name> <model_name> <bs>`;
    the container leaves its result in <perf_path>/<name>-<model_name>-<bs>.txt
    whose 4th whitespace-separated token is the FPS (see test()).

    Args:
        name: framework, "torch" or "jittor".
        model_name: model identifier, optionally "train_"-prefixed.
        bs: batch size.

    Returns:
        float FPS, or -1 if the run or result parsing failed (best-effort:
        a single failing configuration must not abort the whole sweep).
    """
    cmd = f"sudo docker run --gpus all --rm -v {perf_path}:/root/.cache/jittor --network host jittor/jittor-perf{suffix} bash -c 'PYTHONPATH=/root/.cache/jittor/src python3.7 /root/.cache/jittor/src/jittor/test/perf/perf.py {name} {model_name} {bs}'"
    fps = -1
    try:
        print("run cmd:", cmd)
        if os.system(cmd) == 0:
            with open(f"{perf_path}/{name}-{model_name}-{bs}.txt", 'r') as f:
                fps = float(f.read().split()[3])
    except Exception as e:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still abort
        # the sweep; keep the deliberate best-effort -1 fallback, but say why.
        print(f"test_main({name}, {model_name}, {bs}) failed:", e)
    return fps
104+
def time_iter(duration=2, min_iter=5, max_iter=10000000):
    """Yield iteration indices 0, 1, 2, ... for a timed benchmark loop.

    Stops after the index whose completion satisfies BOTH conditions:
    more than `duration` seconds have elapsed since the first yield, and
    at least `min_iter + 1` iterations have run.  `max_iter` (new,
    backward-compatible parameter) bounds the loop as a safety net --
    previously a hard-coded 10000000.

    Args:
        duration: minimum wall-clock seconds to keep iterating.
        min_iter: minimum *index* that must be reached before stopping.
        max_iter: absolute cap on the number of yielded indices.
    """
    start = time.time()
    for i in range(max_iter):
        yield i
        end = time.time()
        # both the time budget and the minimum-iteration floor must be met
        if end - start > duration and i >= min_iter:
            return
112+
def test(name, model_name, bs):
    """Benchmark a single model (inference or training) and record its FPS.

    Runs inside the container.  Warms up twice with time_iter(), then times
    a >=10 s loop and writes "duration: <sec> FPS: <fps>" to
    ~/.cache/jittor/<name>-<model_name>-<bs>.txt (the file test_main()
    parses on the host side).

    Args:
        name: framework, "torch" or "jittor".
        model_name: model identifier; a "train_" prefix selects a training
            (forward + backward + step) benchmark instead of inference.
        bs: batch size.
    """
    print("hello", name, model_name, bs)
    import numpy as np
    import time
    is_train = False
    _model_name = model_name  # keep the original (incl. "train_") name for the result file
    if model_name.startswith("train_"):
        is_train = True
        model_name = model_name[6:]
    if name == "torch":
        import torch
        import torchvision.models as tcmodels
        from torch import optim
        from torch import nn
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        model = tcmodels.__dict__[model_name]()
        model = model.cuda()
    else:
        import jittor as jt
        from jittor import optim
        from jittor import nn
        jt.flags.use_cuda = 1
        jt.cudnn.set_algorithm_cache_size(10000)
        import jittor.models as jtmodels
        model = jtmodels.__dict__[model_name]()
        # BUG FIX: previously compared the *model object* to a string
        # (`model == "resnet152"`), which is always False, so the workspace
        # cap below never applied.  Compare the model name instead.
        if model_name in ("resnet152", "resnet101") and bs == 128 and is_train:
            # large training batches need a smaller cudnn workspace to fit
            jt.cudnn.set_max_workspace_ratio(0.05)
    if is_train:
        model.train()
    else:
        model.eval()
    img_size = 224
    if model_name == "inception_v3":
        img_size = 300
    test_img = np.random.random((bs, 3, img_size, img_size)).astype("float32")
    if is_train:
        label = (np.random.random((bs,)) * 1000).astype("int32")
    if name == "torch":
        test_img = torch.Tensor(test_img).cuda()
        if is_train:
            label = torch.LongTensor(label).cuda()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: torch.cuda.synchronize()
        jt = torch  # alias so the feature probes below work for both frameworks
    else:
        test_img = jt.array(test_img).stop_grad()
        if is_train:
            label = jt.array(label).stop_grad()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: jt.sync_all(True)

    sync()
    use_profiler = os.environ.get("use_profiler", "0") == "1"
    # NOTE(review): probes "nograd" but calls no_grad(); neither torch nor
    # jittor exposes an attribute literally named "nograd", so this branch
    # looks dead.  Deliberately left unchanged: "fixing" the probe would
    # also wrap *training* runs in no_grad() and break backward().
    if hasattr(jt, "nograd"):
        ng = jt.no_grad()
        ng.__enter__()
    def iter():
        # One benchmark step: forward pass, plus backward + optimizer step
        # when training; otherwise force materialization of the output.
        x = model(test_img)
        if isinstance(x, tuple):
            x = x[0]
        if is_train:
            loss = nn.CrossEntropyLoss()(x, label)
            if name == "jittor":
                opt.step(loss)  # jittor's step computes grads from the loss
            else:
                opt.zero_grad()
                loss.backward()
                opt.step()
        else:
            # NOTE(review): .sync() is a jittor Var method; confirm the torch
            # inference path actually reaches here with an object accepting it.
            x.sync()
    sync()
    # two warm-up rounds (kernel compilation, cudnn autotuning) before timing
    for i in time_iter():
        iter()
    sync()
    for i in time_iter():
        iter()
    sync()
    if use_profiler:
        if name == "torch":
            prof = torch.autograd.profiler.profile(use_cuda=True)
        else:
            prof = jt.profile_scope()
        prof.__enter__()
    if name == "jittor":
        # disable the parallel op compiler so it cannot distort the timed run
        if hasattr(jt.flags, "use_parallel_op_compiler"):
            jt.flags.use_parallel_op_compiler = 0
    start = time.time()
    # timed run: at least 10 seconds (see time_iter)
    for i in time_iter(10):
        iter()
    sync()
    end = time.time()
    if use_profiler:
        prof.__exit__(None, None, None)
        if name == "torch":
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
    total_iter = i + 1
    print("duration:", end - start, "FPS:", total_iter * bs / (end - start))
    fpath = f"{home_path}/.cache/jittor/{name}-{_model_name}-{bs}.txt"
    with open(fpath, 'w') as f:
        f.write(f"duration: {end-start} FPS: {total_iter*bs/(end-start)}")
    # BUG FIX: mode was hex 0x666 (== 0o3146), setting bogus permission bits;
    # 0o666 (rw-rw-rw-) is intended so the host user can read a file written
    # by root inside the container.
    os.chmod(fpath, 0o666)
215+
# CLI dispatch: no arguments -> orchestrate the full host-side benchmark
# sweep; otherwise run a single in-container benchmark:
#   perf.py <framework> <model_name> <batch_size>
if len(sys.argv) <= 1:
    main()
else:
    framework, chosen_model, batch_size = sys.argv[1:]
    test(framework, chosen_model, int(batch_size))