@@ -191,7 +191,13 @@
"df = df.drop(labels=['label'])\n",
"\n",
"print(\"Number of lines in the DF = {:,}\".format(len(df)))\n",
"df.head(5).to_pandas()"
"df.head(5).to_pandas()\n",
"\n",
"## keeping only 90% because of the books\n",
"## because of temporary 1.5x overhead \n",
"## introduced in 0.12\n",
"## this will away with 0.13\n",
"df = df.head(int(len(df)*0.90))"
]
},
{
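This first hunk trims the frame to 90% of its rows so the temporary 1.5x string-memory overhead in cuDF 0.12 still fits on the GPU. If the notebook may also run on 0.13, where the overhead is gone, a version guard would keep the workaround from discarding data unnecessarily; a minimal sketch (the guard is hypothetical, not part of the PR), assuming `df` is the frame built in the cell above:

```python
import cudf

# Only trim on cuDF 0.12, where string ops temporarily hold ~1.5x the data;
# on 0.13+ the full frame can be kept (hypothetical guard, not in the PR).
if cudf.__version__.startswith("0.12"):
    df = df.head(int(len(df) * 0.90))
```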
@@ -328,7 +334,7 @@
"metadata": {},
"outputs": [],
"source": [
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean'].data, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean']._column.nvstrings, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'].to_pandas()"
]
},
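The recurring change in this PR swaps `Series.data` for the internal `._column.nvstrings` handle: in cuDF 0.12, `.data` returns a raw device buffer rather than the `nvstrings` object the standalone `nvtext` functions expect. A self-contained sketch of the pattern (the sample strings and stopwords are invented):

```python
import cudf
import nvstrings
import nvtext

s = cudf.Series(["the quick brown fox", "jumps over the lazy dog"])
stopwords = nvstrings.to_device(["the", "over"])

# cuDF <= 0.11: nvtext.replace_tokens(s.data, stopwords, ' ')
# cuDF 0.12: .data is a plain buffer, so reach through the column instead
cleaned = nvtext.replace_tokens(s._column.nvstrings, stopwords, " ")
s = cudf.Series(cleaned)
```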
@@ -394,7 +400,7 @@
" \n",
" # remove stopwords\n",
" stopwords_gpu = nvstrings.to_device(stopwords)\n",
" input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')\n",
" input_strs = nvtext.replace_tokens(input_strs._column.nvstrings, stopwords_gpu, ' ')\n",
" input_strs = cudf.Series(input_strs)\n",
" \n",
" # replace multiple spaces with single one and strip leading/trailing spaces\n",
@@ -464,7 +470,7 @@
" df = cudf.DataFrame()\n",
" # tokenize sentences into a string using nvtext.tokenize()\n",
" # it into a single tall data-frame\n",
" df['string'] = nvtext.tokenize(str_col.data)\n",
" df['string'] = nvtext.tokenize(str_col._column.nvstrings)\n",
" \n",
" # Using Group by to do a value count for string columns\n",
" # This will be natively supported soon\n",
@@ -634,8 +640,8 @@
" \"\"\"\n",
" import rmm\n",
" \n",
" cat = nvcategory.from_strings(str_s.data).set_keys(keys)\n",
" device_array = rmm.device_array(str_s.data.size(), dtype=np.int32) \n",
" cat = nvcategory.from_strings(str_s._column.nvstrings).set_keys(keys)\n",
" device_array = rmm.device_array(str_s._column.nvstrings.size(), dtype=np.int32) \n",
" cat.values(devptr=device_array.device_ctypes_pointer.value)\n",
" \n",
" return cudf.Series(device_array)\n",
@@ -674,7 +680,7 @@
"%%time\n",
"# keep only top 20k words in the dataset\n",
"th = 20_000\n",
"keys = count_df['string'][:th].data\n",
"keys = count_df['string'][:th]._column.nvstrings\n",
"encoded_wc_ls = []\n",
"\n",
"for auth_wc_df in author_wc_ls:\n",
@@ -805,8 +811,8 @@
" count_sr = encoded_wc_df['counts']\n",
" token_id_sr = encoded_wc_df['encoded_str_id']\n",
" \n",
" count_ar = count_sr.data.to_gpu_array()\n",
" token_id_ar = token_id_sr.data.to_gpu_array()\n",
" count_ar = count_sr.to_gpu_array()\n",
" token_id_ar = token_id_sr.to_gpu_array()\n",
" author_ar = count_dary[author_id]\n",
" \n",
" # See https://numba.pydata.org/numba-doc/0.13/CUDAJit.html\n",
@@ -1015,7 +1021,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.6"
}
},
"nbformat": 4,