Skip to content

add Miniproject2 #55

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 0 additions & 102 deletions MongoDB.ipynb

This file was deleted.

161 changes: 161 additions & 0 deletions apatel79.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"cells": [
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import pprint\n",
  "import re\n",
  "import pymongo, json\n",
  "\n",
  "pp = pprint.PrettyPrinter(indent=1, width=65)\n",
  "\n",
  "# Open the 'apatel79' collection of the 'fdac19mp2' database on the MongoDB host.\n",
  "client = pymongo.MongoClient(host=\"da1.eecs.utk.edu\")\n",
  "db = client['fdac19mp2']\n",
  "coll = db['apatel79']\n",
  "\n",
  "# Dataset metadata to register, one document per dataset. Every document has\n",
  "# the same keys: topic, title, license, description and urls (a list).\n",
  "datasets = [\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stackoverflow Data',\n",
  "        'license': 'CC',\n",
  "        'description': 'Updated on a quarterly basis, this BigQuery dataset includes an archive of Stack Overflow content, including posts, votes, tags, and badges. This dataset is updated to mirror the Stack Overflow content on the Internet Archive, and is also available through the Stack Exchange Data Explorer.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/stackoverflow'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'StackSample',\n",
  "        'license': 'CC w attribution',\n",
  "        'description': 'Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/stacksample'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'R Questions from Stackoverflow',\n",
  "        'license': 'CC w attribution',\n",
  "        'description': 'Full text of questions and answers from Stack Overflow that are tagged with the r tag, useful for natural language processing and community analysis.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/rquestions'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Python Questions from Stackoverflow',\n",
  "        'license': 'CC w attribution',\n",
  "        'description': 'Full text of all questions and answers from Stack Overflow that are tagged with the python tag. Useful for natural language processing and community analysis. See also the dataset of R questions.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/pythonquestions'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'StackLite',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'A dataset of Stack Overflow programming questions.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/stacklite'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stackoverflow Survey Results',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'A dataset of favorite technologies to their job preference.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Pylint',\n",
  "        'title': 'Pylint Results for Python Code Snippets on Stack Overflow',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'This dataset contains valid pylint results for all Stack Overflow code snippets from SOTorrent',\n",
  "        'urls': ['https://zenodo.org/record/2558544#.XZdhGvcpAUs'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stack Overflow Developer Survey, 2017',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'This dataset contains survey information like programming languages, salary, code style and various other information.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/so-survey-2017'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stack Overflow Tag Network',\n",
  "        'license': 'CC BY-SA 3.0',\n",
  "        'description': 'stack_network_links contains links of the network, the source and target tech tags plus the value of the the link between each pair stack_network_nodes contains nodes of the network, the name of each node, which group that node belongs to (calculated via a cluster walktrap), and a node size based on how often that technology tag is used',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/stack-overflow-tag-network'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Contextual Documentation Referencing on Stack Overflow',\n",
  "        'license': 'CC BY-SA 4.0',\n",
  "        'description': 'This dataset contains contextual information in regards to documentation.',\n",
  "        'urls': ['https://zenodo.org/record/2556643#.XZdh5vcpAUs'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'R and Python Stack Overflow Answers + Sentiment',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'Stack Overflow answers by the Top 10 r and python users extracted using BigQuery. Also includes data on whether the answer was accepted and some additional data based on sentiment analysis of the answer text.',\n",
  "        'urls': ['https://www.kaggle.com/ojwatson/stack-overflow-output'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'SOTorrent: Reconstructing and Analyzing the Evolution of Stack Overflow Posts',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'SOTorrent provides access to the version history of Stack Overflow content at the level of whole posts and individual text or code blocks.',\n",
  "        'urls': ['https://zenodo.org/record/1201554#.XZdjYvcpAUs'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Dataset with manually validated version histories of Stack Overflow posts',\n",
  "        'license': 'Open Database',\n",
  "        'description': 'This dataset contains information in regard to SOTorrent, specifically information regarding the versioning history.',\n",
  "        'urls': ['https://zenodo.org/record/1477110#.XZdjafcpAUs'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stack Overflow Developer Survey Results 2019',\n",
  "        'license': 'CC BY-SA 4.0',\n",
  "        'description': 'This dataset contains survey results from 2019 regarding developers\\'s favorite technologies to their job preference.',\n",
  "        'urls': ['https://www.kaggle.com/mchirico/stack-overflow-developer-survey-results-2019?source=post_page-----e0ae8335fdc4----------------------'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Questions from Cross Validated Stack Exchange',\n",
  "        'license': 'CC BY-SA 3.0',\n",
  "        'description': 'Full text of questions and answers from Cross Validated, the statistics and machine learning Q&A site from the Stack Exchange network.',\n",
  "        'urls': ['https://www.kaggle.com/stackoverflow/statsquestions'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Stackoverflow',\n",
  "        'title': 'Stack Overflow Tag Prediction',\n",
  "        'license': 'CC0: Public Domain',\n",
  "        'description': 'Data that may be useful for NLP - it contains titles and tags',\n",
  "        'urls': ['https://www.kaggle.com/badalgupta/stack-overflow-tag-prediction'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Github',\n",
  "        'title': '452M commits on GitHub',\n",
  "        'license': 'CC BY-NC',\n",
  "        'description': 'Github commit data that contains repository name, git hash, author email, date/time, and commit message.',\n",
  "        'urls': ['https://data.world/vmarkovtsev/452-m-commits-on-github'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Github',\n",
  "        'title': 'Code review regression analysis of open source GitHub projects',\n",
  "        'license': 'CC BY-SA 4.0',\n",
  "        'description': 'This dataset was collected from GitHub, and includes 3,126 projects in 143 languages, with 489,038 issues and 382,771 pull requests.',\n",
  "        'urls': ['https://datadryad.org/stash/dataset/doi:10.6078/D14X0T'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Github',\n",
  "        'title': 'Top Starred Open Source Projects on GitHub',\n",
  "        'license': 'Public Domain',\n",
  "        'description': 'Github commit data that contains repository data - i.e. project popularity',\n",
  "        'urls': ['https://data.world/chasewillden/top-starred-open-source-projects-on-github'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Github',\n",
  "        'title': 'GitHub Duplicate Repositories with Source Repositories',\n",
  "        'license': 'Public Domain',\n",
  "        'description': 'Dataset that flags duplicate repositories and has attached source repositories - possible candidate for errors / miscredidation',\n",
  "        'urls': ['https://data.world/iankelly/github-duplicate-repositories-with-source-repositories'],\n",
  "    },\n",
  "    {\n",
  "        'topic': 'Github',\n",
  "        'title': 'Github Issues',\n",
  "        'license': 'MIT',\n",
  "        'description': '8 million GitHub issue titles and description from 2017',\n",
  "        'urls': ['https://inclass.kaggle.com/davidshinn/github-issues'],\n",
  "    },\n",
  "]\n",
  "\n",
  "# insert_many() stores the documents in list order; it replaces the per-document\n",
  "# Collection.insert() calls, which PyMongo deprecated in 3.0 and removed in 4.0.\n",
  "# NOTE(review): nothing here guards against duplicates, so re-running this cell\n",
  "# will presumably insert the same documents again.\n",
  "coll.insert_many(datasets)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "{'_id': ObjectId('5d7e6769e1eaff62f15de0d2'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['url1', 'url2']}\n",
    "{'_id': ObjectId('5d7edc7c995d246453e1025b'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['url1', 'url2']}\n",
    "{'_id': ObjectId('5d81827493569c62818e7979'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['url1', 'url2']}\n",
    "{'_id': ObjectId('5d8193ebd4732fb4def4810e'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
    "{'_id': ObjectId('5d81940ed4732fb4def48110'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
    "{'_id': ObjectId('5d819429d4732fb4def48112'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'IMDB dataset',\n",
    " 'urls': ['https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv']}\n",
    "{'_id': ObjectId('5d824e392d00ba0dda2b8223'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['url1', 'url2']}\n",
    "{'_id': ObjectId('5d84e383e2fb382bffecc36a'),\n",
    " 'description': 'The list of projects on github with the '\n",
    " 'largest number of starts',\n",
    " 'first dataset': 'largest projects',\n",
    " 'license': 'NA',\n",
    " 'topic': 'git URLs',\n",
    " 'urls': ['url1', 'url2']}\n"
   ]
  }
 ],
 "source": [
  "import pprint\n",
  "import pymongo, json\n",
  "\n",
  "# Formatter for the dump below: 1-space indent, lines wrapped at 65 columns.\n",
  "pp = pprint.PrettyPrinter(width=65, indent=1)\n",
  "\n",
  "# Open the 'apatel79' collection of the 'fdac19mp2' database on the MongoDB host.\n",
  "client = pymongo.MongoClient(host=\"da1.eecs.utk.edu\")\n",
  "db = client['fdac19mp2']\n",
  "coll = db['apatel79']\n",
  "\n",
  "# Print every document returned by an unfiltered find(), one after another.\n",
  "for record in coll.find():\n",
  "    rendered = pp.pformat(record)\n",
  "    print(rendered)"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}