fdac19 · rhendz · Oct 4, 2019 · Oct 4, 2019
diff --git a/MongoDB.ipynb b/MongoDB.ipynb
diff --git a/apatel79.ipynb b/apatel79.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid syntax (<ipython-input-1-6d62dde7c1a8>, line 12)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-1-6d62dde7c1a8>\"\u001b[0;36m, line \u001b[0;32m12\u001b[0m\n\u001b[0;31m    coll.insert ( { 'topic':'Stackoverflow', 'title': 'StackSample', 'license': 'CC w attribution', 'description': 'Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website.', 'urls:' ['https://www.kaggle.com/stackoverflow/stacksample']})\u001b[0m\n\u001b[0m                                                                                                                                                                                                                                                                                         ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pprint\n",
+    "import re\n",
+    "import pymongo, json\n",
+    "\n",
+    "pp = pprint.PrettyPrinter(indent=1,width=65)\n",
+    "\n",
+    "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n",
+    "db = client ['fdac19mp2']\n",
+    "coll = db ['apatel79']\n",
+    "# for each dataset\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stackoverflow Data', 'license': 'CC', 'description': 'Updated on a quarterly basis, this BigQuery dataset includes an archive of Stack Overflow content, including posts, votes, tags, and badges. This dataset is updated to mirror the Stack Overflow content on the Internet Archive, and is also available through the Stack Exchange Data Explorer.', 'urls': [ 'https://www.kaggle.com/stackoverflow/stackoverflow' ] } )\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'StackSample', 'license': 'CC w attribution', 'description': 'Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website.', 'urls:' ['https://www.kaggle.com/stackoverflow/stacksample']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'R Questions from Stackoverflow', 'license': 'CC w attribution', 'description': 'Full text of questions and answers from Stack Overflow that are tagged with the r tag, useful for natural language processing and community analysis.', 'urls:' ['https://www.kaggle.com/stackoverflow/rquestions']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Python Questions from Stackoverflow', 'license': 'CC w attribution', 'description': 'Full text of all questions and answers from Stack Overflow that are tagged with the python tag. Useful for natural language processing and community analysis. See also the dataset of R questions.', 'urls:' ['https://www.kaggle.com/stackoverflow/pythonquestions']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'StackLite', 'license': 'Open Database', 'description': 'A dataset of Stack Overflow programming questions.', 'urls:' ['https://www.kaggle.com/stackoverflow/stacklite']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stackoverflow Survey Results', 'license': 'Open Database', 'description': 'A dataset of favorite technologies to their job preference.', 'urls:' ['https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey']})\n",
+    "coll.insert ( { 'topic':'Pylint', 'title': 'Pylint Results for Python Code Snippets on Stack Overflow', 'license': 'Open Database', 'description': 'This dataset contains valid pylint results for all Stack Overflow code snippets from SOTorrent', 'urls:' ['https://zenodo.org/record/2558544#.XZdhGvcpAUs']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stack Overflow Developer Survey, 2017', 'license': 'Open Database', 'description': 'This dataset contains survey information like programming languages, salary, code style and various other information.', 'urls:' ['https://www.kaggle.com/stackoverflow/so-survey-2017']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stack Overflow Tag Network', 'license': 'CC BY-SA 3.0', 'description': 'stack_network_links contains links of the network, the source and target tech tags plus the value of the the link between each pair stack_network_nodes contains nodes of the network, the name of each node, which group that node belongs to (calculated via a cluster walktrap), and a node size based on how often that technology tag is used', 'urls:' ['https://www.kaggle.com/stackoverflow/stack-overflow-tag-network']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Contextual Documentation Referencing on Stack Overflow', 'license': 'CC BY-SA 4.0', 'description': 'This dataset contains contextual information in regards to documentation.', 'urls:' ['https://zenodo.org/record/2556643#.XZdh5vcpAUs']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'R and Python Stack Overflow Answers + Sentiment', 'license': 'Open Database', 'description': 'Stack Overflow answers by the Top 10 r and python users extracted using BigQuery. Also includes data on whether the answer was accepted and some additional data based on sentiment analysis of the answer text.', 'urls:' ['https://www.kaggle.com/ojwatson/stack-overflow-output']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'SOTorrent: Reconstructing and Analyzing the Evolution of Stack Overflow Posts', 'license': 'Open Database', 'description': 'SOTorrent provides access to the version history of Stack Overflow content at the level of whole posts and individual text or code blocks.', 'urls:' ['https://zenodo.org/record/1201554#.XZdjYvcpAUs']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Dataset with manually validated version histories of Stack Overflow posts', 'license': 'Open Database', 'description': 'This dataset contains information in regard to SOTorrent, specifically information regarding the versioning history.', 'urls:' ['https://zenodo.org/record/1477110#.XZdjafcpAUs']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stack Overflow Developer Survey Results 2019', 'license': 'CC BY-SA 4.0', 'description': 'This dataset contains survey results from 2019 regarding developers\\'s favorite technologies to their job preference.', 'urls:' ['https://www.kaggle.com/mchirico/stack-overflow-developer-survey-results-2019?source=post_page-----e0ae8335fdc4----------------------']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Questions from Cross Validated Stack Exchange', 'license': 'CC BY-SA 3.0', 'description': 'Full text of questions and answers from Cross Validated, the statistics and machine learning Q&A site from the Stack Exchange network.', 'urls:' ['https://www.kaggle.com/stackoverflow/statsquestions']})\n",
+    "coll.insert ( { 'topic':'Stackoverflow', 'title': 'Stack Overflow Tag Prediction', 'license': 'CC0: Public Domain', 'description': 'Data that may be useful for NLP - it contains titles and tags', 'urls:' ['https://www.kaggle.com/badalgupta/stack-overflow-tag-prediction']})\n",
+    "coll.insert ( { 'topic':'Github', 'title': '452M commits on GitHub', 'license': 'CC BY-NC', 'description': 'Github commit data that contains repository name, git hash, author email, date/time, and commit message.', 'urls:' ['https://data.world/vmarkovtsev/452-m-commits-on-github']})\n",
+    "coll.insert ( { 'topic':'Github', 'title': 'Code review regression analysis of open source GitHub projects', 'license': 'CC BY-SA 4.0', 'description': 'This dataset was collected from GitHub, and includes 3,126 projects in 143 languages, with 489,038 issues and 382,771 pull requests.', 'urls:' ['https://datadryad.org/stash/dataset/doi:10.6078/D14X0T']})\n",
+    "coll.insert ( { 'topic':'Github', 'title': 'Top Starred Open Source Projects on GitHub', 'license': 'Public Domain', 'description': 'Github commit data that contains repository data - i.e. project popularity', 'urls:' ['https://data.world/chasewillden/top-starred-open-source-projects-on-github']})\n",
+    "coll.insert ( { 'topic':'Github', 'title': 'GitHub Duplicate Repositories with Source Repositories', 'license': 'Public Domain', 'description': 'Dataset that flags duplicate repositories and has attached source repositories - possible candidate for errors / miscredidation', 'urls:' ['https://data.world/iankelly/github-duplicate-repositories-with-source-repositories']})\n",
+    "coll.insert ( { 'topic':'Github', 'title': 'Github Issues', 'license': 'MIT', 'description': '8 million GitHub issue titles and description from 2017', 'urls:' ['https://inclass.kaggle.com/davidshinn/github-issues']})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'_id': ObjectId('5d7e6769e1eaff62f15de0d2'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['url1', 'url2']}\n",
+      "{'_id': ObjectId('5d7edc7c995d246453e1025b'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['url1', 'url2']}\n",
+      "{'_id': ObjectId('5d81827493569c62818e7979'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['url1', 'url2']}\n",
+      "{'_id': ObjectId('5d8193ebd4732fb4def4810e'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
+      "{'_id': ObjectId('5d81940ed4732fb4def48110'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
+      "{'_id': ObjectId('5d819429d4732fb4def48112'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'IMDB dataset',\n",
+      " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
+      "{'_id': ObjectId('5d824e392d00ba0dda2b8223'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['url1', 'url2']}\n",
+      "{'_id': ObjectId('5d84e383e2fb382bffecc36a'),\n",
+      " 'description': 'The list of projects on github with the '\n",
+      "                'largest number of starts',\n",
+      " 'first dataset': 'largest projects',\n",
+      " 'license': 'NA',\n",
+      " 'topic': 'git URLs',\n",
+      " 'urls': ['url1', 'url2']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pprint\n",
+    "import pymongo, json\n",
+    "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n",
+    "db = client ['fdac19mp2']\n",
+    "coll = db ['apatel79']\n",
+    "pp = pprint.PrettyPrinter(indent=1,width=65)\n",
+    "for r in coll. find():\n",
+    "  print(pp .pformat (r))  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}