microsoft
diff --git a/‎README.md‎
Lines changed: 0 additions & 7 deletions b/‎README.md‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎antares/antares_compiler.py‎
Lines changed: 2 additions & 2 deletions b/‎antares/antares_compiler.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎antares/default_codegen.py‎
Lines changed: 2 additions & 2 deletions b/‎antares/default_codegen.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎antares/run.sh‎
Lines changed: 1 addition & 1 deletion b/‎antares/run.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/c-mcpu/schedule/standard/default.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/c-mcpu/schedule/standard/default.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/c-ocl_amdgpu/config.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/c-ocl_amdgpu/config.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/c-rocm/schedule/standard/default.py‎
Lines changed: 5 additions & 2 deletions b/‎backends/c-rocm/schedule/standard/default.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/c-sycl_intel/include/backend.hpp‎
Lines changed: 9 additions & 5 deletions b/‎backends/c-sycl_intel/include/backend.hpp‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎docker/Dockerfile.c-base‎
Lines changed: 4 additions & 7 deletions b/‎docker/Dockerfile.c-base‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎frameworks/pytorch/custom_op.py‎
Lines changed: 0 additions & 148 deletions b/‎frameworks/pytorch/custom_op.py‎
Lines changed: 0 additions & 148 deletions
@@ -61,13 +61,6 @@ BACKEND=c-cuda STEP=2000 COMPUTE_V1='- S = 512; einstein_v2(input_dict={"input0"
 # Cleanup history caches:
 antares clean
 
-# Boot HTTP daemon for accepting searching tasks:
-antares rest-server
-
-# Setup Plugin for Pytorch && Examples:
-BACKEND=c-cuda antares torch-setup
-BACKEND=c-mcpu antares torch-setup
-python3 -m antares_core.frameworks.pytorch.examples.1_hello_world
 ```
 
 ## Contributing
 
@@ -676,14 +676,14 @@ def measure_batch(inputs):
   eval_client.init(backend_root=backend_root)
   dev_id = int(os.environ.get('DEV_ID', '0'))
 
-  if (save_path and dump_path) is None and os.environ.get("FUNC_NAME", None) and '// [metadata] ' in device_source:
+  if (save_path and dump_path) is None and os.environ.get("TORCH_FN", None) and '// [metadata] ' in device_source:
     AntaresGlobal.device_source = device_source
     metadata = device_source.index('// [metadata] ')
     metadata = device_source[metadata:device_source.index('\n', metadata)].split()[-1].encode('utf-8')
     hex_code = device_source.encode('utf-8')
     if int(os.environ.get('CODE_DEBUG', 0)) == 0:
       hex_code = binascii.unhexlify(eval_client.eval(kernel_path=kernel_path, dev_id=(fix_device_id if fix_device_id >= 0 else dev_id), backend_root=backend_root, compile=1)['HEX'][1:-1])
-    with open(get_real_path(os.environ["FUNC_NAME"] + ".mod"), 'wb') as fp:
+    with open(get_real_path(os.environ["TORCH_FN"] + ".mod"), 'wb') as fp:
       fp.write(metadata)
       fp.write(hex_code)
 
 
@@ -354,5 +354,5 @@ def translate_code(code, config):
   kernel_slices = translate_code(func.imported_modules[0].get_source(), best_config)
   return kernel_slices
 
-if int(os.environ.get('TVM', 1)) == 0:
-  from next_codegen import codegen
+if len(os.environ.get('TORCH_FN', '')) > 0:
+  from torch_codegen.torch_codegen import codegen
@@ -51,7 +51,7 @@ if [[ "${TVM}" != "0" ]] && ( [[ "$(cat ${TVM_HOME}/VERSION_TAG 2>/dev/null)" !=
 fi
 
 if [[ "$COMPUTE_V1" == "" ]]; then
-  export COMPUTE_V1='- einstein_v2("output0[N, M] = input0[N, M] + input1[N, M]", input_dict={"input0": {"dtype": "float32", "shape": [1024, 512]}, "input1": {"dtype": "float32", "shape": [1024, 512]}})'
+  export COMPUTE_V1='- N = 1024 * 1024 * 64; einstein_v2("output0[N] = input0[N].call(`max`, const(0, dtype=input0.dtype()))", {"input0": {"dtype": "float32", "shape": [N]}})'
 fi
 
 mkdir -p ${ANTARES_DRIVER_PATH}
 
@@ -15,7 +15,7 @@ def mcpu_auto_schedule(s, output, prefix):
     for i in range(len(output.op.reduce_axis)):
       slice_reduce.append(cfg.define_split(f"{prefix}:R{i}", attrs.get_extent(output.op.reduce_axis[i]), num_outputs=2, init_vals=[[-1, 4],]))
 
-    unroll = cfg.define_knob(f"{prefix}:UN", [1, 4, 8, 16, 32, 64], init_vals=[1,] if attrs.backend == 'c-mcpu_avx512' else [0,])
+    unroll = cfg.define_knob(f"{prefix}:UN", [1, 4, 8, 16, 32, 64], init_vals=[1,] if attrs.backend != 'c-mcpu' else [0,])
 
     output_local, = s.cache_write([output], "local")
 
 
@@ -24,7 +24,7 @@ def get_execution_parallism():
 
 def do_native_translation_v2(codeset, **kwargs):
   kernel_name, in_args, out_args, body = codeset
-  expand_args = ', '.join([f'__global {x[0]}* {x[1]}' for x in in_args + out_args])
+  expand_args = ', '.join([f'__global {x[0]}* __restrict {x[1]}' for x in in_args + out_args])
   if 'VAMAP' in os.environ:
     expand_args += ', ' + ', '.join([f'int {x.split(":")[0]}' if '/_' not in x.split(":")[0] else x.split(":")[0].replace('/', ' ') for x in os.environ['VAMAP'].split(',')])
 
 
@@ -39,8 +39,11 @@ def schedule(attrs):
   attrs.advanced_sched = config or step > 0
   tail_op, explicit_ops = None, [x for x in attrs.explicit_ops]
 
-  if (len(explicit_ops) > 1 and
-      not explicit_ops[-1].output(0).op.reduce_axis):
+  red = int(os.environ.get('RED', -1))
+  if red == -1:
+    red = (len(explicit_ops) > 1 and not explicit_ops[-1].output(0).op.reduce_axis)
+
+  if red:
     fuse_tail = attrs.auto_config.define_knob(f"FU", [False, True])
     tail_op = explicit_ops[-1]
     if fuse_tail:
 
@@ -19,7 +19,7 @@ namespace ab {
 
   void init(int dev) {
     try {
-      if (__BACKEND__ == "c-sycl_intel")
+      if (__BACKEND__ != "c-sycl_cuda")
         _sycl_queue = std::move(sycl::queue(sycl::default_selector{}));
       else {
         // for SYCL CUDA, select the i-th GPU device
@@ -41,6 +41,10 @@ namespace ab {
     } catch (sycl::exception const &e) {
       std::terminate();
     }
+
+    int steps = getenv("STEP") ? std::atoi(getenv("STEP")) : 0;
+    if (steps > 0)
+      return;
     size_t max_compute_units = _sycl_queue.get_device().get_info<cl::sycl::info::device::max_compute_units>();
     size_t max_work_group_size = _sycl_queue.get_device().get_info<cl::sycl::info::device::max_work_group_size>();
     size_t max_mem_alloc_size = _sycl_queue.get_device().get_info<cl::sycl::info::device::max_mem_alloc_size>();
@@ -61,7 +65,7 @@ namespace ab {
       it.pop_back();
       return dptr;
     }
-    // if (__BACKEND__ == "c-sycl_intel")
+    // if (__BACKEND__ != "c-sycl_cuda")
     //   return memalign(sysconf(_SC_PAGESIZE), byteSize);
     return sycl::malloc_device(byteSize, _sycl_queue);
   }
@@ -75,8 +79,8 @@ namespace ab {
     ab_utils::TempFile tempfile("cpp", source);
     auto path = tempfile.get_path();
 
-    if (__BACKEND__ == "c-sycl_intel")
-      ab_utils::Process({"dpcpp", path, "-std=c++17", "-lpthread", "-fPIC", "-shared", "-Wno-pass-failed", "-O3", "-ffast-math", "-march=native", "-o", path + ".out"}, 10);
+    if (__BACKEND__ != "c-sycl_cuda")
+      ab_utils::Process({"dpcpp", path, "-std=c++17", "-lpthread", "-fPIC", "-shared", "-Wno-pass-failed", "-O3", "-ffast-math", "-Wno-deprecated", "-march=native", "-o", path + ".out"}, 10);
     else {
       std::string gpu_arch = "50"; // Corresponds to the back-end default.
 #ifdef SYCL_CUDA
@@ -142,7 +146,7 @@ namespace ab {
     }
 
     ((void(*)(void*, long, void* const*))hFunc[0])(&_sycl_queue, attrs, krnl_args.data());
-    if (__BACKEND__ == "c-sycl_intel") // have to sync unlike CUDA
+    if (__BACKEND__ != "c-sycl_cuda") // have to sync except CUDA
        _sycl_queue.wait();
   }
 
 
@@ -16,14 +16,11 @@ RUN apt-get update && apt install -y --no-install-recommends git ca-certificates
 RUN /bin/echo -e "set backspace=indent,eol,start\nset nocompatible\nset ts=4" > /etc/vim/vimrc.tiny
 
 ADD ./engine /antares/engine
-RUN NO_PYTHON=1 /antares/engine/install_antares_host.sh && rm -rf /var/lib/apt/lists/* ~/.cache
-RUN bash -c 'rm -rf ~/.local/antares/3rdparty/tvm/build/{CMake*,Makefile,cmake_install.cmake}'
-RUN bash -c 'rm -rf ~/.local/antares/3rdparty/tvm/{src,include,golang,tests,3rdparty,device-stub,apps,conda,docker,docs,gallery,jvm,nnvm,rust,vta,web,cmake,.??*}'
-RUN echo '' > ~/.local/antares/3rdparty/tvm/python/tvm/relay/__init__.py
 
-ENV ANTARES_VERSION 0.3.23.2
+ENV ANTARES_VERSION 0.9.0
 
-RUN cd ~ && git clone https://github.com/microsoft/antares --branch latest --single-branch --depth 1 antares_core && mv ~/.local/antares/3rdparty antares_core
+RUN cd ~ && git clone https://github.com/microsoft/antares --branch latest --single-branch --depth 1 antares_core
+RUN cd ~ && curl -LO https://github.com/microsoft/antares/releases/download/v0.3.20/3rdparty.tar.gz && mkdir antares_core/3rdparty && tar xzvf 3rdparty.tar.gz -C antares_core/3rdparty >/dev/null 2>&1
 RUN cd ~ && sed -i "s/@VERSION@/${ANTARES_VERSION}/g" /antares/engine/dist-info/METADATA && cp -r /antares/engine/dist-info ~/antares-${ANTARES_VERSION}.dist-info
-RUN cd ~ && touch antares_core/__init__.py
+RUN cd ~ && touch antares_core/__init__.py && mv /antares/engine/torch_codegen antares_core/antares >/dev/null 2>&1 || true
 RUN cd ~ && rm -rf antares_core/.??* && zip -r /antares-${ANTARES_VERSION}-py3-none-manylinux1_x86_64.whl antares* >/dev/null