|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": null, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {}, |
7 | 7 | "outputs": [], |
8 | 8 | "source": [ |
|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": null, |
| 14 | + "execution_count": 2, |
15 | 15 | "metadata": {}, |
16 | 16 | "outputs": [], |
17 | 17 | "source": [ |
18 | | - "# 下载数据:https://osf.io/rduj2" |
| 18 | + "# 下载两份数据:https://osf.io/5mk3x, https://osf.io/m48ed\n", |
| 19 | + "# 将两份数据解压到当前目录下的data文件夹中\n", |
| 20 | + "# data目录结构如下:\n", |
| 21 | + "# data/\n", |
| 22 | + "# |--train_10M/\n", |
| 23 | + "# |--dev/" |
19 | 24 | ] |
20 | 25 | }, |
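If the manual download above is inconvenient, the same `data/` layout can be produced in code. A minimal sketch, assuming the OSF `/download` endpoint serves the two files and that both are zip archives unpacking to `train_10M/` and `dev/`:

```python
# Hypothetical helper: fetch and unpack both OSF archives into ./data.
# Assumes each https://osf.io/<id>/download URL returns a zip archive.
import io
import zipfile
from urllib.request import urlopen

for osf_id in ["5mk3x", "m48ed"]:
    with urlopen(f"https://osf.io/{osf_id}/download") as resp:
        zipfile.ZipFile(io.BytesIO(resp.read())).extractall("data")
```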
21 | 26 | { |
|
36 | 41 | }, |
37 | 42 | { |
38 | 43 | "cell_type": "code", |
39 | | - "execution_count": 1, |
| 44 | + "execution_count": 3, |
40 | 45 | "metadata": {}, |
41 | 46 | "outputs": [], |
42 | 47 | "source": [ |
43 | 48 | "from pathlib import Path\n", |
44 | | - "from mrclean import *" |
| 49 | + "from mrclean import *\n", |
| 50 | + "import os" |
45 | 51 | ] |
46 | 52 | }, |
47 | 53 | { |
48 | 54 | "cell_type": "code", |
49 | | - "execution_count": 15, |
| 55 | + "execution_count": 4, |
50 | 56 | "metadata": {}, |
51 | 57 | "outputs": [], |
52 | 58 | "source": [ |
53 | | - "DATA_ROOT = Path(\"F:/llm-deploy-data/data/Babyllama\")\n", |
| 59 | + "DATA_ROOT = Path(\"./data\")\n", |
54 | 60 | "SEQ_LENGTH = 128 # this is a legacy parameter, it does not affect cleaning\n", |
55 | | - "DATA_SPLITS = ['babylm_10M', 'babylm_dev']\n", |
| 61 | + "DATA_SPLITS = ['train_10M', 'dev']\n", |
56 | 62 | "\n", |
57 | 63 | "CLEANUP_FUNCTIONS = {\n", |
58 | 64 | " 'aochildes': cleanup_aochildes,\n", |
|
70 | 76 | }, |
71 | 77 | { |
72 | 78 | "cell_type": "code", |
73 | | - "execution_count": 16, |
| 79 | + "execution_count": 5, |
74 | 80 | "metadata": {}, |
75 | 81 | "outputs": [ |
76 | 82 | { |
77 | 83 | "name": "stdout", |
78 | 84 | "output_type": "stream", |
79 | 85 | "text": [ |
80 | | - "🧹 Cleaned 'bnc_spoken.train' (size 4883879 -> 4851676) in babylm_10M\n", |
81 | | - "🧹 Cleaned 'childes.train' (size 15482927 -> 15482927) in babylm_10M\n", |
82 | | - "🧹 Cleaned 'gutenberg.train' (size 13910986 -> 13910986) in babylm_10M\n", |
83 | | - "🧹 Cleaned 'open_subtitles.train' (size 10806305 -> 10804026) in babylm_10M\n", |
84 | | - "🧹 Cleaned 'simple_wiki.train' (size 8411630 -> 8387062) in babylm_10M\n", |
85 | | - "🧹 Cleaned 'switchboard.train' (size 719322 -> 719322) in babylm_10M\n", |
86 | | - "🧹 Cleaned 'bnc_spoken.dev' (size 6538139 -> 6503778) in babylm_dev\n", |
87 | | - "🧹 Cleaned 'childes.dev' (size 14638378 -> 14638378) in babylm_dev\n", |
88 | | - "🧹 Cleaned 'gutenberg.dev' (size 15490473 -> 15490473) in babylm_dev\n", |
89 | | - "🧹 Cleaned 'open_subtitles.dev' (size 11016133 -> 11014854) in babylm_dev\n", |
90 | | - "🧹 Cleaned 'simple_wiki.dev' (size 8149513 -> 8128239) in babylm_dev\n", |
91 | | - "🧹 Cleaned 'switchboard.dev' (size 724013 -> 724013) in babylm_dev\n" |
| 86 | + "🧹 Cleaned 'childes.train' (size 15482927 -> 15482927) in train_10M\n", |
| 87 | + "🧹 Cleaned 'simple_wiki.train' (size 8411630 -> 8387062) in train_10M\n", |
| 88 | + "🧹 Cleaned 'bnc_spoken.train' (size 4883879 -> 4851676) in train_10M\n", |
| 89 | + "🧹 Cleaned 'gutenberg.train' (size 13910986 -> 13910986) in train_10M\n", |
| 90 | + "🧹 Cleaned 'switchboard.train' (size 719322 -> 719322) in train_10M\n", |
| 91 | + "🧹 Cleaned 'open_subtitles.train' (size 10806305 -> 10804026) in train_10M\n", |
| 92 | + "🧹 Cleaned 'switchboard.dev' (size 724013 -> 724013) in dev\n", |
| 93 | + "🧹 Cleaned 'simple_wiki.dev' (size 8149513 -> 8128239) in dev\n", |
| 94 | + "🧹 Cleaned 'gutenberg.dev' (size 15490473 -> 15490473) in dev\n", |
| 95 | + "🧹 Cleaned 'bnc_spoken.dev' (size 6538139 -> 6503778) in dev\n", |
| 96 | + "🧹 Cleaned 'open_subtitles.dev' (size 11016133 -> 11014854) in dev\n", |
| 97 | + "🧹 Cleaned 'childes.dev' (size 14638378 -> 14638378) in dev\n" |
92 | 98 | ] |
93 | 99 | } |
94 | 100 | ], |
|
117 | 123 | }, |
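The cleaning loop that produced this log is elided from the hunk above. Judging by the `🧹 Cleaned '<file>' (size a -> b) in <split>` lines and the `train_10M_clean/` directory used later, it plausibly resembles the sketch below; the `cleanup_*` signature (raw text plus `SEQ_LENGTH`) is an assumption about `mrclean`, not confirmed by the diff:

```python
# Hypothetical reconstruction of the elided cleanup loop; the exact
# mrclean function signatures are assumed, not shown in this diff.
for split in DATA_SPLITS:
    in_dir, out_dir = DATA_ROOT / split, DATA_ROOT / f"{split}_clean"
    out_dir.mkdir(parents=True, exist_ok=True)
    for src in in_dir.glob("*"):
        if not src.is_file():
            continue
        text = src.read_text(encoding="utf-8")
        # src.stem maps e.g. 'childes.train' -> 'childes', the dict key
        cleaned = CLEANUP_FUNCTIONS[src.stem](text, SEQ_LENGTH)
        (out_dir / src.name).write_text(cleaned, encoding="utf-8")
        print(f"🧹 Cleaned '{src.name}' (size {len(text)} -> {len(cleaned)}) in {split}")
```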
118 | 124 | { |
119 | 125 | "cell_type": "code", |
120 | | - "execution_count": 17, |
| 126 | + "execution_count": 6, |
121 | 127 | "metadata": {}, |
122 | 128 | "outputs": [], |
123 | 129 | "source": [ |
|
129 | 135 | }, |
130 | 136 | { |
131 | 137 | "cell_type": "code", |
132 | | - "execution_count": 18, |
| 138 | + "execution_count": 7, |
133 | 139 | "metadata": {}, |
134 | 140 | "outputs": [ |
135 | 141 | { |
|
142 | 148 | ], |
143 | 149 | "source": [ |
144 | 150 | "# We train the tokenizer on the train data only\n", |
145 | | - "data_dir = Path(\"F:/llm-deploy-data/data/Babyllama/babylm_10M_clean/\")\n", |
| 151 | + "data_dir = Path(\"./data/train_10M_clean/\")\n", |
146 | 152 | "\n", |
147 | 153 | "paths = [str(f) for f in data_dir.glob(\"*\") if f.is_file() and not f.name.endswith(\".DS_Store\") and f.suffix in [\".train\"]]\n", |
148 | 154 | "\n", |
|
153 | 159 | }, |
154 | 160 | { |
155 | 161 | "cell_type": "code", |
156 | | - "execution_count": 19, |
| 162 | + "execution_count": 8, |
157 | 163 | "metadata": {}, |
158 | 164 | "outputs": [], |
159 | 165 | "source": [ |
|
167 | 173 | }, |
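The construction of `tokenizer` happens in the cell elided just above. The `Ġ`-prefixed tokens in the output further down indicate a byte-level BPE, so a setup consistent with that (a sketch, not necessarily the exact original cell) would be:

```python
# Sketch of a ByteLevel BPE setup with the tokenizers library; consistent
# with the 'Ġ' tokens in the output below, but the original cell is elided.
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
```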
168 | 174 | { |
169 | 175 | "cell_type": "code", |
170 | | - "execution_count": 20, |
| 176 | + "execution_count": 9, |
171 | 177 | "metadata": {}, |
172 | | - "outputs": [], |
| 178 | + "outputs": [ |
| 179 | + { |
| 180 | + "name": "stdout", |
| 181 | + "output_type": "stream", |
| 182 | + "text": [ |
| 183 | + "\n", |
| 184 | + "\n", |
| 185 | + "\n" |
| 186 | + ] |
| 187 | + } |
| 188 | + ], |
173 | 189 | "source": [ |
174 | 190 | "trainer = trainers.BpeTrainer(vocab_size=16000, min_frequency=2, special_tokens=[\"<pad>\", \"<s>\", \"</s>\"])\n", |
175 | 191 | "tokenizer.train(paths, trainer)" |
176 | 192 | ] |
177 | 193 | }, |
178 | 194 | { |
179 | 195 | "cell_type": "code", |
180 | | - "execution_count": 22, |
| 196 | + "execution_count": 10, |
181 | 197 | "metadata": {}, |
182 | 198 | "outputs": [], |
183 | 199 | "source": [ |
184 | | - "tokenizer_path = DATA_ROOT / \"models/gpt-clean-16000.json\"\n", |
| 200 | + "tokenizer_path = \"./models/gpt-clean-16000.json\"\n", |
| 201 | + "os.makedirs(\"models\", exist_ok=True)\n", |
185 | 202 | "tokenizer.save(str(tokenizer_path), pretty=True)" |
186 | 203 | ] |
187 | 204 | }, |
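A tokenizer JSON saved this way can be reloaded verbatim later (for example in the training script) with `Tokenizer.from_file`:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("./models/gpt-clean-16000.json")
```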
|
194 | 211 | }, |
195 | 212 | { |
196 | 213 | "cell_type": "code", |
197 | | - "execution_count": 23, |
| 214 | + "execution_count": 11, |
198 | 215 | "metadata": {}, |
199 | 216 | "outputs": [ |
200 | 217 | { |
201 | 218 | "name": "stdout", |
202 | 219 | "output_type": "stream", |
203 | 220 | "text": [ |
204 | 221 | "Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']\n", |
205 | | - "Encoded IDs: [302, 1784, 3266, 5712, 15961, 541, 190, 11553, 1469, 16]\n", |
| 222 | + "Encoded IDs: [300, 1782, 3264, 5710, 15959, 539, 188, 11551, 1467, 16]\n", |
206 | 223 | "Decoded String: The quick brown fox jumps over the lazy dog.\n" |
207 | 224 | ] |
208 | 225 | } |
|
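The source of this round-trip cell is elided from the diff; given the three printed lines, it plausibly reads like this sketch:

```python
# Hypothetical source for the elided cell, inferred from its output.
text = "The quick brown fox jumps over the lazy dog."
encoded = tokenizer.encode(text)
print("Encoded String:", encoded.tokens)
print("Encoded IDs:", encoded.ids)
print("Decoded String:", tokenizer.decode(encoded.ids))
```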
248 | 265 | "name": "python", |
249 | 266 | "nbconvert_exporter": "python", |
250 | 267 | "pygments_lexer": "ipython3", |
251 | | - "version": "3.9.18" |
| 268 | + "version": "3.9.20" |
252 | 269 | }, |
253 | 270 | "orig_nbformat": 4 |
254 | 271 | }, |
|