|
28 | 28 | },
|
29 | 29 | {
|
30 | 30 | "cell_type": "code",
|
31 |
| - "execution_count": null, |
| 31 | + "execution_count": 1, |
32 | 32 | "id": "d7570b2e",
|
33 | 33 | "metadata": {},
|
34 | 34 | "outputs": [],
|
|
51 | 51 | },
|
52 | 52 | {
|
53 | 53 | "cell_type": "code",
|
54 |
| - "execution_count": null, |
| 54 | + "execution_count": 2, |
55 | 55 | "id": "20dcdaf7",
|
56 | 56 | "metadata": {},
|
57 | 57 | "outputs": [],
|
|
93 | 93 | },
|
94 | 94 | {
|
95 | 95 | "cell_type": "code",
|
96 |
| - "execution_count": null, |
| 96 | + "execution_count": 3, |
97 | 97 | "id": "0806d2db",
|
98 | 98 | "metadata": {},
|
99 |
| - "outputs": [], |
| 99 | + "outputs": [ |
| 100 | + { |
| 101 | + "name": "stdout", |
| 102 | + "output_type": "stream", |
| 103 | + "text": [ |
| 104 | + "Type of server: zilliz_cloud\n" |
| 105 | + ] |
| 106 | + } |
| 107 | + ], |
100 | 108 | "source": [
|
101 | 109 | "from pymilvus import connections, utility\n",
|
102 | 110 | "\n",
|
|
134 | 142 | },
|
135 | 143 | {
|
136 | 144 | "cell_type": "code",
|
137 |
| - "execution_count": null, |
| 145 | + "execution_count": 4, |
138 | 146 | "id": "dd2be7fd",
|
139 | 147 | "metadata": {},
|
140 |
| - "outputs": [], |
| 148 | + "outputs": [ |
| 149 | + { |
| 150 | + "name": "stdout", |
| 151 | + "output_type": "stream", |
| 152 | + "text": [ |
| 153 | + "device: cpu\n", |
| 154 | + "<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>\n", |
| 155 | + "SentenceTransformer(\n", |
| 156 | + " (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel \n", |
| 157 | + " (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n", |
| 158 | + ")\n", |
| 159 | + "model_name: BAAI/bge-base-en-v1.5\n", |
| 160 | + "EMBEDDING_LENGTH: 768\n", |
| 161 | + "MAX_SEQ_LENGTH: 512\n" |
| 162 | + ] |
| 163 | + } |
| 164 | + ], |
141 | 165 | "source": [
|
142 | 166 | "# Import torch.\n",
|
143 | 167 | "import torch\n",
|
|
188 | 212 | },
|
189 | 213 | {
|
190 | 214 | "cell_type": "code",
|
191 |
| - "execution_count": null, |
192 |
| - "metadata": {}, |
193 |
| - "outputs": [], |
| 215 | + "execution_count": 5, |
| 216 | + "metadata": {}, |
| 217 | + "outputs": [ |
| 218 | + { |
| 219 | + "name": "stdout", |
| 220 | + "output_type": "stream", |
| 221 | + "text": [ |
| 222 | + "Embedding length: 768\n", |
| 223 | + "Created collection: MIlvusDocs\n", |
| 224 | + "Schema: {'auto_id': True, 'description': 'The schema for docs pages', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': True}\n" |
| 225 | + ] |
| 226 | + } |
| 227 | + ], |
194 | 228 | "source": [
|
195 | 229 | "from pymilvus import (\n",
|
196 | 230 | " FieldSchema, DataType, \n",
|
|
246 | 280 | },
|
247 | 281 | {
|
248 | 282 | "cell_type": "code",
|
249 |
| - "execution_count": null, |
250 |
| - "metadata": {}, |
251 |
| - "outputs": [], |
| 283 | + "execution_count": 6, |
| 284 | + "metadata": {}, |
| 285 | + "outputs": [ |
| 286 | + { |
| 287 | + "name": "stdout", |
| 288 | + "output_type": "stream", |
| 289 | + "text": [ |
| 290 | + "{'loading_progress': '100%'}\n" |
| 291 | + ] |
| 292 | + } |
| 293 | + ], |
252 | 294 | "source": [
|
253 | 295 | "# 5. Drop the index, in case it already exists.\n",
|
254 | 296 | "mc.drop_index()\n",
|
|
266 | 308 | " field_name=\"vector\", \n",
|
267 | 309 | " index_params=index_params)\n",
|
268 | 310 | "\n",
|
269 |
| - "# collection.load()\n", |
270 |
| - "\n", |
271 | 311 | "# Get loading progress\n",
|
| 312 | + "mc.load()\n", |
272 | 313 | "progress = utility.loading_progress(COLLECTION_NAME)\n",
|
273 | 314 | "print(progress)"
|
274 | 315 | ]
|
275 | 316 | },
|
276 | 317 | {
|
277 | 318 | "cell_type": "code",
|
278 |
| - "execution_count": null, |
| 319 | + "execution_count": 7, |
279 | 320 | "id": "6861beb7",
|
280 | 321 | "metadata": {},
|
281 |
| - "outputs": [], |
| 322 | + "outputs": [ |
| 323 | + { |
| 324 | + "name": "stdout", |
| 325 | + "output_type": "stream", |
| 326 | + "text": [ |
| 327 | + "loaded 15 documents\n" |
| 328 | + ] |
| 329 | + } |
| 330 | + ], |
282 | 331 | "source": [
|
283 | 332 | "## Read docs into LangChain\n",
|
284 | 333 | "#!pip install langchain \n",
|
|
309 | 358 | },
|
310 | 359 | {
|
311 | 360 | "cell_type": "code",
|
312 |
| - "execution_count": null, |
313 |
| - "metadata": {}, |
314 |
| - "outputs": [], |
| 361 | + "execution_count": 8, |
| 362 | + "metadata": {}, |
| 363 | + "outputs": [ |
| 364 | + { |
| 365 | + "name": "stdout", |
| 366 | + "output_type": "stream", |
| 367 | + "text": [ |
| 368 | + "chunking time: 0.01805710792541504\n", |
| 369 | + "docs: 15, split into: 15\n", |
| 370 | + "split into chunks: 159, type: list of <class 'langchain.schema.document.Document'>\n", |
| 371 | + "\n", |
| 372 | + "Looking at a sample chunk...\n", |
| 373 | + "{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'rtdocs/pymilvus.readthedocs.io/en/latest/install.html'}\n", |
| 374 | + "demonstrate how to install and using PyMilvus in a virtual environment. See virtualenv for more info\n" |
| 375 | + ] |
| 376 | + } |
| 377 | + ], |
315 | 378 | "source": [
|
316 | 379 | "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n",
|
317 | 380 | "\n",
|
|
384 | 447 | },
|
385 | 448 | {
|
386 | 449 | "cell_type": "code",
|
387 |
| - "execution_count": null, |
| 450 | + "execution_count": 9, |
388 | 451 | "id": "512130a3",
|
389 | 452 | "metadata": {},
|
390 |
| - "outputs": [], |
| 453 | + "outputs": [ |
| 454 | + { |
| 455 | + "name": "stdout", |
| 456 | + "output_type": "stream", |
| 457 | + "text": [ |
| 458 | + "{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'https://pymilvus.readthedocs.io/en/latest/install.html'}\n", |
| 459 | + "Installation¶ Installing via pip¶ PyMilvus is in the Python Package Index. PyMilvus only support pyt\n" |
| 460 | + ] |
| 461 | + } |
| 462 | + ], |
391 | 463 | "source": [
|
392 | 464 | "# Clean up the metadata urls\n",
|
393 | 465 | "for doc in chunks:\n",
|
|
413 | 485 | },
|
414 | 486 | {
|
415 | 487 | "cell_type": "code",
|
416 |
| - "execution_count": null, |
| 488 | + "execution_count": 10, |
417 | 489 | "metadata": {},
|
418 | 490 | "outputs": [],
|
419 | 491 | "source": [
|
|
445 | 517 | },
|
446 | 518 | {
|
447 | 519 | "cell_type": "code",
|
448 |
| - "execution_count": null, |
| 520 | + "execution_count": 11, |
449 | 521 | "id": "b51ff139",
|
450 | 522 | "metadata": {},
|
451 |
| - "outputs": [], |
| 523 | + "outputs": [ |
| 524 | + { |
| 525 | + "name": "stdout", |
| 526 | + "output_type": "stream", |
| 527 | + "text": [ |
| 528 | + "Start inserting entities\n", |
| 529 | + "Milvus insert time for 159 vectors: 1.0154786109924316 seconds\n", |
| 530 | + "(insert count: 159, delete count: 0, upsert count: 0, timestamp: 445785288603074562, success count: 159, err count: 0)\n", |
| 531 | + "[{\"name\":\"_default\",\"collection_name\":\"MIlvusDocs\",\"description\":\"\"}]\n" |
| 532 | + ] |
| 533 | + } |
| 534 | + ], |
452 | 535 | "source": [
|
453 | 536 | "# Insert a batch of data into the Milvus collection.\n",
|
454 | 537 | "\n",
|
|
503 | 586 | },
|
504 | 587 | {
|
505 | 588 | "cell_type": "code",
|
506 |
| - "execution_count": null, |
| 589 | + "execution_count": 12, |
507 | 590 | "id": "5e7f41f4",
|
508 | 591 | "metadata": {},
|
509 |
| - "outputs": [], |
| 592 | + "outputs": [ |
| 593 | + { |
| 594 | + "name": "stdout", |
| 595 | + "output_type": "stream", |
| 596 | + "text": [ |
| 597 | + "query length: 54\n" |
| 598 | + ] |
| 599 | + } |
| 600 | + ], |
510 | 601 | "source": [
|
511 | 602 | "# Define a sample question about your data.\n",
|
512 | 603 | "question = \"what is the default distance metric used in AUTOINDEX?\"\n",
|
|
534 | 625 | },
|
535 | 626 | {
|
536 | 627 | "cell_type": "code",
|
537 |
| - "execution_count": null, |
| 628 | + "execution_count": 13, |
538 | 629 | "id": "89642119",
|
539 | 630 | "metadata": {},
|
540 |
| - "outputs": [], |
| 631 | + "outputs": [ |
| 632 | + { |
| 633 | + "name": "stdout", |
| 634 | + "output_type": "stream", |
| 635 | + "text": [ |
| 636 | + "Loaded milvus collection into memory.\n", |
| 637 | + "Milvus search time: 0.06506514549255371 sec\n", |
| 638 | + "type: <class 'pymilvus.client.abstract.SearchResult'>, count: 5\n" |
| 639 | + ] |
| 640 | + } |
| 641 | + ], |
541 | 642 | "source": [
|
542 | 643 | "# RETRIEVAL USING MILVUS.\n",
|
543 | 644 | "\n",
|
|
587 | 688 | },
|
588 | 689 | {
|
589 | 690 | "cell_type": "code",
|
590 |
| - "execution_count": null, |
591 |
| - "metadata": {}, |
592 |
| - "outputs": [], |
| 691 | + "execution_count": 14, |
| 692 | + "metadata": {}, |
| 693 | + "outputs": [ |
| 694 | + { |
| 695 | + "name": "stdout", |
| 696 | + "output_type": "stream", |
| 697 | + "text": [ |
| 698 | + "2267\n" |
| 699 | + ] |
| 700 | + } |
| 701 | + ], |
593 | 702 | "source": [
|
594 | 703 | "# # TODO - remove this before saving in github.\n",
|
595 | 704 | "# for n, hits in enumerate(results):\n",
|
|
617 | 726 | },
|
618 | 727 | {
|
619 | 728 | "cell_type": "code",
|
620 |
| - "execution_count": null, |
| 729 | + "execution_count": 15, |
621 | 730 | "id": "3e7fa0b6",
|
622 | 731 | "metadata": {},
|
623 |
| - "outputs": [], |
| 732 | + "outputs": [ |
| 733 | + { |
| 734 | + "name": "stdout", |
| 735 | + "output_type": "stream", |
| 736 | + "text": [ |
| 737 | + "Question: what is the default distance metric used in AUTOINDEX?\n", |
| 738 | + "Answer: lazy dog\n" |
| 739 | + ] |
| 740 | + } |
| 741 | + ], |
624 | 742 | "source": [
|
625 | 743 | "# BASELINING THE LLM: ASK A QUESTION WITHOUT ANY RETRIEVED CONTEXT.\n",
|
626 | 744 | "\n",
|
|
649 | 767 | },
|
650 | 768 | {
|
651 | 769 | "cell_type": "code",
|
652 |
| - "execution_count": null, |
| 770 | + "execution_count": 16, |
653 | 771 | "id": "a68e87b1",
|
654 | 772 | "metadata": {},
|
655 |
| - "outputs": [], |
| 773 | + "outputs": [ |
| 774 | + { |
| 775 | + "name": "stdout", |
| 776 | + "output_type": "stream", |
| 777 | + "text": [ |
| 778 | + "Question: what is the default distance metric used in AUTOINDEX?\n", |
| 779 | + "Answer: MetricType.L2\n" |
| 780 | + ] |
| 781 | + } |
| 782 | + ], |
656 | 783 | "source": [
|
657 | 784 | "# NOW ASK THE SAME LLM THE SAME QUESTION USING THE RETRIEVED CONTEXT.\n",
|
658 | 785 | "QA_input = {\n",
|
|
673 | 800 | },
|
674 | 801 | {
|
675 | 802 | "cell_type": "code",
|
676 |
| - "execution_count": null, |
| 803 | + "execution_count": 17, |
677 | 804 | "id": "d0e81e68",
|
678 | 805 | "metadata": {},
|
679 | 806 | "outputs": [],
|
|
684 | 811 | },
|
685 | 812 | {
|
686 | 813 | "cell_type": "code",
|
687 |
| - "execution_count": null, |
| 814 | + "execution_count": 18, |
688 | 815 | "id": "c777937e",
|
689 | 816 | "metadata": {},
|
690 |
| - "outputs": [], |
| 817 | + "outputs": [ |
| 818 | + { |
| 819 | + "name": "stdout", |
| 820 | + "output_type": "stream", |
| 821 | + "text": [ |
| 822 | + "Author: Christy Bergman\n", |
| 823 | + "\n", |
| 824 | + "Python implementation: CPython\n", |
| 825 | + "Python version : 3.10.12\n", |
| 826 | + "IPython version : 8.15.0\n", |
| 827 | + "\n", |
| 828 | + "torch : 2.0.1\n", |
| 829 | + "transformers: 4.34.1\n", |
| 830 | + "milvus : 2.3.3\n", |
| 831 | + "pymilvus : 2.3.3\n", |
| 832 | + "langchain : 0.0.322\n", |
| 833 | + "\n", |
| 834 | + "conda environment: py310\n", |
| 835 | + "\n" |
| 836 | + ] |
| 837 | + } |
| 838 | + ], |
691 | 839 | "source": [
|
692 | 840 | "# Props to Sebastian Raschka for this handy watermark.\n",
|
693 | 841 | "# !pip install watermark\n",
|
|
0 commit comments