@@ -191,7 +191,13 @@
"df = df.drop(labels=['label'])\n",
"\n",
"print(\"Number of lines in the DF = {:,}\".format(len(df)))\n",
"df.head(5).to_pandas()"
"df.head(5).to_pandas()\n",
"\n",
"## keeping only 90% because of the books\n",
"## because of temporary 1.5x overhead \n",
"## introduced in 0.12\n",
"## this will away with 0.13\n",
"df = df.head(int(len(df)*0.90))"
]
},
{
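This first hunk trims the frame to 90% of its rows so the temporary 1.5x string-memory overhead in cuDF 0.12 still fits on the GPU. If the notebook may also run on 0.13, where the overhead is gone, a version guard would keep the workaround from discarding data unnecessarily; a minimal sketch (the guard is hypothetical, not part of the PR), assuming `df` is the frame built in the cell above:

```python
import cudf

# Only trim on cuDF 0.12, where string ops temporarily hold ~1.5x the data;
# on 0.13+ the full frame can be kept (hypothetical guard, not in the PR).
if cudf.__version__.startswith("0.12"):
    df = df.head(int(len(df) * 0.90))
```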
@@ -328,7 +334,7 @@
"metadata": {},
"outputs": [],
"source": [
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean'].data, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean']._column.nvstrings, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'].to_pandas()"
]
},
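The recurring change in this PR swaps `Series.data` for the internal `._column.nvstrings` handle: in cuDF 0.12, `.data` returns a raw device buffer rather than the `nvstrings` object the standalone `nvtext` functions expect. A self-contained sketch of the pattern (the sample strings and stopwords are invented):

```python
import cudf
import nvstrings
import nvtext

s = cudf.Series(["the quick brown fox", "jumps over the lazy dog"])
stopwords = nvstrings.to_device(["the", "over"])

# cuDF <= 0.11: nvtext.replace_tokens(s.data, stopwords, ' ')
# cuDF 0.12: .data is a plain buffer, so reach through the column instead
cleaned = nvtext.replace_tokens(s._column.nvstrings, stopwords, " ")
s = cudf.Series(cleaned)
```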
@@ -394,7 +400,7 @@
" \n",
" # remove stopwords\n",
" stopwords_gpu = nvstrings.to_device(stopwords)\n",
" input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')\n",
" input_strs = nvtext.replace_tokens(input_strs._column.nvstrings, stopwords_gpu, ' ')\n",
" input_strs = cudf.Series(input_strs)\n",
" \n",
" # replace multiple spaces with single one and strip leading/trailing spaces\n",
@@ -464,7 +470,7 @@
" df = cudf.DataFrame()\n",
" # tokenize sentences into a string using nvtext.tokenize()\n",
" # it into a single tall data-frame\n",
" df['string'] = nvtext.tokenize(str_col.data)\n",
" df['string'] = nvtext.tokenize(str_col._column.nvstrings)\n",
" \n",
" # Using Group by to do a value count for string columns\n",
" # This will be natively supported soon\n",
@@ -634,8 +640,8 @@
" \"\"\"\n",
" import rmm\n",
" \n",
" cat = nvcategory.from_strings(str_s.data).set_keys(keys)\n",
" device_array = rmm.device_array(str_s.data.size(), dtype=np.int32) \n",
" cat = nvcategory.from_strings(str_s._column.nvstrings).set_keys(keys)\n",
" device_array = rmm.device_array(str_s._column.nvstrings.size(), dtype=np.int32) \n",
" cat.values(devptr=device_array.device_ctypes_pointer.value)\n",
" \n",
" return cudf.Series(device_array)\n",
@@ -674,7 +680,7 @@
"%%time\n",
"# keep only top 20k words in the dataset\n",
"th = 20_000\n",
"keys = count_df['string'][:th].data\n",
"keys = count_df['string'][:th]._column.nvstrings\n",
"encoded_wc_ls = []\n",
"\n",
"for auth_wc_df in author_wc_ls:\n",
@@ -805,8 +811,8 @@
" count_sr = encoded_wc_df['counts']\n",
" token_id_sr = encoded_wc_df['encoded_str_id']\n",
" \n",
" count_ar = count_sr.data.to_gpu_array()\n",
" token_id_ar = token_id_sr.data.to_gpu_array()\n",
" count_ar = count_sr.to_gpu_array()\n",
" token_id_ar = token_id_sr.to_gpu_array()\n",
" author_ar = count_dary[author_id]\n",
" \n",
" # See https://numba.pydata.org/numba-doc/0.13/CUDAJit.html\n",
@@ -1015,7 +1021,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.6"
}
},
"nbformat": 4,