Commit cd6902d

Author: Guanheng Zhang

Merge branch 'master' into release/0.8

2 parents: 3546693 + 45b21de

6 files changed: +31 -15 lines


.circleci/unittest/linux/scripts/environment.yml

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@ dependencies:
   - codecov
   - pip
   - pip:
-    - clang-format
     - dataclasses
     - nltk
     - requests

.circleci/unittest/linux/scripts/run_style_checks.sh

Lines changed: 2 additions & 2 deletions

@@ -20,9 +20,9 @@ if [ "${status}" -ne 0 ]; then
 fi

 printf "\x1b[34mRunning clang-format: "
-clang-format --version
+./clang-format --version
 printf "\x1b[0m\n"
-git-clang-format origin/master
+git-clang-format --binary ./clang-format origin/master
 git diff --exit-code
 status=$?
 exit_status="$((exit_status+status))"
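The context lines above show the script's error-handling convention: each check stores `$?` and adds it into a running `exit_status`, so one failing linter does not stop the remaining checks. A self-contained sketch of that pattern, with `true`/`false` as stand-ins for the real check commands:

```shell
#!/bin/sh
# Accumulate exit statuses across independent checks so that a single
# failure does not prevent later checks from running; the final sum is
# non-zero if any check failed.
exit_status=0

true                                   # stand-in for a passing check
status=$?
exit_status="$((exit_status+status))"

false                                  # stand-in for a failing check
status=$?
exit_status="$((exit_status+status))"

echo "${exit_status}"
```

Exiting with the accumulated value at the end lets CI report failure while still printing output from every check.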

.circleci/unittest/linux/scripts/setup_env.sh

Lines changed: 10 additions & 0 deletions

@@ -14,6 +14,11 @@ env_dir="${root_dir}/env"

 cd "${root_dir}"

+case "$(uname -s)" in
+  Darwin*) os=MacOSX;;
+  *) os=Linux
+esac
+
 # 1. Install conda at ./conda
 if [ ! -d "${conda_dir}" ]; then
     printf "* Installing conda\n"
@@ -32,6 +37,11 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
+if [ "${os}" == Linux ] ; then
+    clangformat_path="${root_dir}/clang-format"
+    curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o "${clangformat_path}"
+    chmod +x "${clangformat_path}"
+fi

 # 4. Download
 printf "* Downloading SpaCy English models\n"
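The `uname -s` switch added above can be factored into a small function for illustration; `detect_os` is a hypothetical name, not part of the actual script:

```shell
#!/bin/sh
# Map a `uname -s` kernel name to the OS label used elsewhere in the
# script (e.g. to pick a platform-specific download). "Darwin" means
# macOS; anything else is treated as Linux here, mirroring the diff.
detect_os() {
  case "$1" in
    Darwin*) echo MacOSX ;;
    *)       echo Linux  ;;
  esac
}

detect_os "Darwin"        # prints MacOSX
detect_os "$(uname -s)"   # prints the label for the current kernel
```

Taking the kernel name as an argument (rather than calling `uname` inside) keeps the mapping easy to exercise on any platform.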

README.rst

Lines changed: 16 additions & 5 deletions

@@ -15,8 +15,11 @@ This repository consists of:
 * `torchtext.data <#data>`_: Generic data loaders, abstractions, and iterators for text (including vocabulary and word vectors)
 * `torchtext.datasets <#datasets>`_: Pre-built loaders for common NLP datasets

-Note: we are currently re-designing the torchtext library to make it more compatible with pytorch (e.g. ``torch.utils.data``). Several datasets have been written with the new abstractions in `torchtext.experimental <https://github.com/pytorch/text/tree/master/torchtext/experimental>`_ folder. We also created an issue to discuss the new abstraction, and users are welcome to leave feedback `link <https://github.com/pytorch/text/issues/664>`_.
+Note: we are currently re-designing the torchtext library to make it more compatible with pytorch (e.g. ``torch.utils.data``). Several datasets have been written with the new abstractions in `torchtext.experimental <https://github.com/pytorch/text/tree/master/torchtext/experimental>`_ folder. We also created an issue to discuss the new abstraction, and users are welcome to leave feedback `link <https://github.com/pytorch/text/issues/664>`_. These prototype building blocks and datasets in the experimental folder are available in the nightly release only. The nightly packages are accessible via Pip and Conda for Windows, Mac, and Linux. For example, Linux users can install the nightly wheels with the following command::

+    pip install --pre torch torchtext -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+
+For more detail instructions, please refer to `Install PyTorch <https://pytorch.org/get-started/locally/>`_. It should be noted that the new building blocks are still under development, and the APIs have not been solidified.

 Installation
 ============
@@ -28,15 +31,17 @@ We recommend Anaconda as Python package management system. Please refer to `pyto
    :widths: 10, 10, 10

    nightly build, master, 3.6+
-   1.5, 0.5, 3.5+
-   1.4, 0.4, "2.7, 3.5+"
+   1.7, 0.8, 3.6+
+   1.6, 0.7, 3.6+
+   1.5, 0.6, 3.5+
+   1.4, 0.5, "2.7, 3.5+"
    0.4 and below, 0.2.3, "2.7, 3.5+"

-Using conda;::
+Using conda::

    conda install -c pytorch torchtext

-Using pip;::
+Using pip::

    pip install torchtext

@@ -64,7 +69,13 @@ To build torchtext from source, you need ``git``, ``CMake`` and C++11 compiler s
    git clone https://github.com/pytorch/text torchtext
    cd torchtext
    git submodule update --init --recursive
+
+   # Linux
    python setup.py clean install
+
+   # OSX
+   MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py clean install
+
    # or ``python setup.py develop`` if you are making modifications.

 **Note**

setup.py

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ def run(self):
     license='BSD',

     install_requires=[
-        'tqdm', 'requests', 'torch', 'numpy', 'sentencepiece'
+        'tqdm', 'requests', 'torch', 'numpy'
     ],
     python_requires='>=3.5',
     classifiers=[
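After this change, sentencepiece is no longer a hard requirement of the package. As a minimal sketch (package metadata omitted; the `setup_kwargs` name is illustrative, not from torchtext's setup.py), the arguments that would be forwarded to `setuptools.setup()` now declare only these runtime dependencies:

```python
# Illustrative sketch of the dependency declaration after this commit:
# sentencepiece is no longer listed in install_requires, so
# `pip install torchtext` will not pull it in automatically.
setup_kwargs = {
    'install_requires': ['tqdm', 'requests', 'torch', 'numpy'],
    'python_requires': '>=3.5',
}

assert 'sentencepiece' not in setup_kwargs['install_requires']
```

Dropping a heavyweight optional dependency from `install_requires` is a common way to keep the base install lean while still supporting the feature for users who install the extra package themselves.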

test/data/test_functional.py

Lines changed: 2 additions & 6 deletions

@@ -4,7 +4,6 @@
 import uuid
 import unittest

-import sentencepiece as spm
 import torch
 from torchtext.data.functional import (
     generate_sp_model,
@@ -38,11 +37,8 @@ def test_generate_sp_model(self):
         model_prefix = os.path.join(dir_name, f'spm_user_{uuid.uuid4()}')
         model_file = f'{model_prefix}.model'
         generate_sp_model(data_path, vocab_size=23456, model_prefix=model_prefix)
-
-        sp_user = spm.SentencePieceProcessor()
-        sp_user.Load(model_file)
-
-        self.assertEqual(len(sp_user), 23456)
+        sp_model = load_sp_model(model_file)
+        self.assertEqual(sp_model.GetPieceSize(), 23456)

     def test_sentencepiece_numericalizer(self):
         test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
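The test change above removes the module-level `import sentencepiece` in favor of torchtext's own `load_sp_model` helper, so the test file no longer fails to import when sentencepiece is absent. The same decoupling idea — defer an optional dependency until it is actually needed — can be sketched with a small helper (`optional_import` is a hypothetical name, not torchtext API):

```python
import importlib


def optional_import(name):
    """Import a module only when it is actually requested, returning
    None instead of raising if the package is not installed."""
    try:
        return importlib.import_module(name)
    except ImportError:
        return None


# json ships with the standard library, so this succeeds...
assert optional_import("json") is not None
# ...while a missing optional dependency degrades gracefully to None.
assert optional_import("definitely_not_installed_pkg") is None
```

Callers can then check for `None` and raise a targeted "please install X" error only on the code paths that truly need the package.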

0 commit comments