diff --git a/.vscode/launch.json b/.vscode/launch.json
index 9e7a696f508..0e9af305bd6 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -115,7 +115,8 @@
                 "--all",
                 "--commit-to=snapshot/v1",
                 "--commit-orgs-to=cfapi/orgs/v1",
-                // "--orgs-source=file://${workspaceFolder}/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json"
+                "--orgs-source=file://${workspaceFolder}/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json",
+                "--scratch-git-dir=/tmp/bpi-scratch.git"
             ],
             "skipFiles": [
                 "/**"
diff --git a/crawler/lib/ScratchGit.js b/crawler/lib/ScratchGit.js
new file mode 100644
index 00000000000..821180e88de
--- /dev/null
+++ b/crawler/lib/ScratchGit.js
@@ -0,0 +1,152 @@
+const logger = require('winston');
+const tmp = require('tmp-promise');
+const gitUp = require('git-up');
+const exitHook = require('async-exit-hook');
+const { Repo } = require('hologit/lib');
+const { default: PQueue } = require('p-queue');
+
+let cachedRepo = null;
+let cachedGit = null;
+let pendingInit = null;
+const cachedFetchResults = new Map();
+const fetchQueue = new PQueue({ concurrency: 1 });
+const { GITHUB_ACTOR, GITHUB_TOKEN } = process.env;
+
+/**
+ * Replace any occurrence of the GitHub token so it never reaches the logs
+ * @param {*} message
+ * @returns {string}
+ */
+function redactToken (message) {
+    const text = String(message);
+    return GITHUB_TOKEN ? text.split(GITHUB_TOKEN).join('***') : text;
+}
+
+/**
+ * Create a temporary git directory and register its cleanup as an exit hook
+ * @returns {Promise<string>} path of the new directory
+ */
+async function createTemporaryGitDir () {
+    const gitDir = await tmp.dir({ unsafeCleanup: true });
+
+    exitHook(callback => {
+        // always signal completion, a failed cleanup must not block exit
+        gitDir.cleanup()
+            .catch(err => logger.warn(`could not remove temporary GIT_DIR=${gitDir.path}: ${err}`))
+            .then(() => callback());
+    });
+
+    return gitDir.path;
+}
+
+/**
+ * Initialize a bare scratch repository and publish it to the module cache.
+ * The repo and git handles are assigned together, only after init succeeds,
+ * and the in-flight promise is shared so concurrent callers wait on one init.
+ * @param {function(): (string|Promise<string>)} resolveGitDir
+ * @param {string} label - used in the log line: persistent or temporary
+ * @returns {Promise<Repo>}
+ */
+function startInit (resolveGitDir, label) {
+    const attempt = (async () => {
+        const gitDir = await resolveGitDir();
+        const repo = new Repo({ gitDir, ref: 'HEAD' });
+        const git = await repo.getGit();
+
+        logger.info(`initializing ${label} GIT_DIR=${gitDir}`);
+        await git.init({ bare: true });
+
+        cachedRepo = repo;
+        cachedGit = git;
+        return repo;
+    })();
+
+    pendingInit = attempt;
+
+    // forget a failed attempt so a later call can retry
+    attempt.catch(() => {
+        if (pendingInit === attempt) {
+            pendingInit = null;
+        }
+    });
+
+    return attempt;
+}
+
+module.exports = class ScratchGit {
+
+    /**
+     * Set the path of a persistent git directory to use as scratch space
+     * for all future operations
+     * @param {string} gitDir
+     */
+    static async setPersistentGitDir (gitDir) {
+        await startInit(() => gitDir, 'persistent');
+    }
+
+    /**
+     * Get the scratch hologit Repo, creating a temporary one on first use
+     * @returns {Promise<Repo>}
+     */
+    static async getRepo () {
+        if (cachedRepo) {
+            return cachedRepo;
+        }
+
+        return pendingInit || startInit(createTemporaryGitDir, 'temporary');
+    }
+
+    /**
+     * Get the git client bound to the scratch repository
+     */
+    static async getGit () {
+        if (!cachedGit) {
+            // will set cachedGit
+            await this.getRepo();
+        }
+
+        return cachedGit;
+    }
+
+    /**
+     * Fetch the current commit hash for given url/ref
+     * @param {string} url
+     * @param {string=HEAD} ref
+     * @returns {Promise<?string>} commit hash, or null if the fetch failed
+     */
+    static async fetchRemote (url, ref = 'HEAD') {
+        return fetchQueue.add(async () => {
+            const { resource, pathname } = gitUp(url);
+            const repoPath = pathname.replace(/\.git$/i, '');
+            const remoteKey = `${resource}${repoPath}/${ref}`;
+
+            if (cachedFetchResults.has(remoteKey)) {
+                return cachedFetchResults.get(remoteKey);
+            }
+
+            let fetchUrl = url;
+            if (resource === 'github.com') {
+                // only embed credentials when both are configured
+                const auth = GITHUB_ACTOR && GITHUB_TOKEN ? `${GITHUB_ACTOR}:${GITHUB_TOKEN}@` : '';
+                fetchUrl = `https://${auth}github.com${repoPath}.git`;
+            }
+
+            const cacheRef = `refs/crawler-cache/${remoteKey}`;
+            const git = await this.getGit();
+            let commitHash = null;
+
+            logger.debug(`fetching ${url}#${ref}`);
+            try {
+                const fetchResult = await git.fetch({ depth: 1, tags: false }, fetchUrl, `+${ref}:${cacheRef}`);
+                logger.silly(`fetched ${url}#${ref}: ${redactToken(fetchResult)}`);
+                commitHash = await git.revParse({ verify: true }, cacheRef, { $nullOnError: true });
+            } catch (err) {
+                logger.debug(`could not fetch ${url}#${ref}: ${redactToken(err)}`);
+            }
+
+            // remember failures too so a bad remote is only tried once per run
+            cachedFetchResults.set(remoteKey, commitHash);
+            return commitHash;
+        });
+    }
+};
diff --git a/crawler/lib/decorators/Git.js b/crawler/lib/decorators/Git.js
new file mode 100644
index 00000000000..5a298843566
--- /dev/null
+++ b/crawler/lib/decorators/Git.js
@@ -0,0 +1,48 @@
+const logger = require('winston');
+const gitUp = require('git-up');
+
+const ScratchGit = require('../ScratchGit.js');
+
+/**
+ * Decorate a project with data about its git repository
+ * @class
+ */
+module.exports = class GitDecorator {
+
+    /**
+     * Decide whether code_url looks like a fetchable git remote. Projects
+     * whose github property is exactly false are skipped.
+     * @param {Object} projectData
+     * @returns {Promise<boolean>}
+     */
+    static async canDecorate ({ code_url = null, github = null }) {
+        if (!code_url || github === false) {
+            return false;
+        }
+
+        let parsedUrl;
+        try {
+            parsedUrl = gitUp(code_url);
+        } catch (err) {
+            // an unparseable URL means there is nothing to fetch
+            logger.debug(`could not parse code_url ${code_url}: ${err}`);
+            return false;
+        }
+
+        const { resource, pathname, protocols } = parsedUrl;
+
+        return Boolean(resource && pathname && protocols.length);
+    }
+
+    /**
+     * Set projectData.git to { head } for the remote's current commit, or
+     * to false when the remote could not be fetched
+     * @param {Object} projectData
+     */
+    static async decorate (projectData) {
+        // TODO: sniff ${url}/info/refs for HTTP to catch bad repos? ehh github doesn't like
+        const commitHash = await ScratchGit.fetchRemote(projectData.code_url);
+
+        projectData.git = commitHash ? { head: commitHash } : false;
+    }
+};
diff --git a/crawler/lib/decorators/PublicCode.js b/crawler/lib/decorators/PublicCode.js
new file mode 100644
index 00000000000..664441795da
--- /dev/null
+++ b/crawler/lib/decorators/PublicCode.js
@@ -0,0 +1,47 @@
+const logger = require('winston');
+const gitUp = require('git-up');
+const yaml = require('js-yaml');
+
+const ScratchGit = require('../ScratchGit.js');
+
+/**
+ * Decorate a project with publiccode.yml data from git
+ * @class
+ */
+module.exports = class PublicCodeDecorator {
+
+    /**
+     * Only projects with a fetched git head can be decorated
+     * @param {Object} projectData
+     * @returns {Promise<boolean>}
+     */
+    static async canDecorate ({ git = null }) {
+        return Boolean(git && git.head);
+    }
+
+    /**
+     * Set projectData['publiccode.yml'] to the parsed publiccode.yml found at
+     * the root of the fetched head commit, or to false when the file is
+     * missing, cannot be parsed, or does not contain a mapping
+     * @param {Object} projectData
+     */
+    static async decorate (projectData) {
+        const git = await ScratchGit.getGit();
+        const publicCodeString = await git.catFile({ p: true }, `${projectData.git.head}:publiccode.yml`, { $nullOnError: true });
+
+        let publicCodeData = null;
+
+        if (publicCodeString) {
+            try {
+                publicCodeData = yaml.safeLoad(publicCodeString);
+            } catch (err) {
+                logger.warn(`failed to load ${projectData.code_url}/publiccode.yml: ${err}`);
+            }
+        }
+
+        // a scalar or list document is not usable publiccode data
+        const isMapping = Boolean(publicCodeData) && typeof publicCodeData === 'object' && !Array.isArray(publicCodeData);
+
+        projectData['publiccode.yml'] = isMapping ? publicCodeData : false;
+    }
+};
diff --git a/crawler/lib/decorators/__tests__/PublicCode.test.js b/crawler/lib/decorators/__tests__/PublicCode.test.js
new file mode 100644
index 00000000000..236e3d3179e
--- /dev/null
+++ b/crawler/lib/decorators/__tests__/PublicCode.test.js
@@ -0,0 +1,46 @@
+const Git = require('../Git.js');
+const PublicCode = require('../PublicCode.js');
+
+
+describe('Project Decorator: PublicCode', () => {
+
+    test('populates publiccode', async () => {
+        const projectData = {
+            code_url: 'https://github.com/italia/18app'
+        };
+
+        // PublicCode reads from the commit that the Git decorator fetches
+        expect(await PublicCode.canDecorate(projectData)).toBeFalse();
+        expect(await Git.canDecorate(projectData)).toBeTrue();
+
+        await Git.decorate(projectData);
+
+        expect(await PublicCode.canDecorate(projectData)).toBeTrue();
+
+        await PublicCode.decorate(projectData);
+
+        expect(projectData['publiccode.yml']).toBeObject();
+        expect(projectData['publiccode.yml']).toHaveProperty('url', 'https://github.com/italia/18app');
+        expect(projectData['publiccode.yml']).toContainKeys([
+            'categories',
+            'description',
+            'developmentStatus',
+            'intendedAudience',
+            'it',
+            'legal',
+            'localisation',
+            'logo',
+            'maintenance',
+            'name',
+            'platforms',
+            'publiccodeYmlVersion',
+            'releaseDate',
+            'softwareType',
+            'url',
+        ]);
+        expect(projectData['publiccode.yml'].categories).toBeArray();
+        expect(projectData['publiccode.yml'].categories).toEqual(['mobile-payment']);
+
+    });
+
+});
diff --git a/crawler/lib/decorators/index.js b/crawler/lib/decorators/index.js
index 8f6fb87e004..1348b085652 100644
--- a/crawler/lib/decorators/index.js
+++ b/crawler/lib/decorators/index.js
@@ -1,5 +1,7 @@
 const decoratorClasses = {
     GitHubRepository: require('./GitHubRepository.js'),
+    Git: require('./Git.js'),
+    // PublicCode: require('./PublicCode.js'),
 };
 
 /**
diff --git a/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json b/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json
index 735217efdd8..38564b4271c 100644
--- a/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json
+++ b/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json
@@ -77,5 +77,22 @@
             "Code for America",
             "Official"
         ]
+    },
+    {
+        "name": "Code for Gainesville",
+        "website": "http://c4gnv.com",
+        "events_url": "http://www.meetup.com/Code-for-Gainesville/",
+        "rss": "",
+        "projects_list_url": "https://docs.google.com/spreadsheets/d/15DOB5ctu8aJEkaE1pjBIvRaZBG2wjbpJaUQUa6UL9bM/pub?output=csv",
+        "city": "Gainesville, FL",
+        "latitude": "29.651634",
+        "longitude": "-82.324826",
+        "type": "Brigade",
+        "social_profiles": {
+            "twitter": "@c4gnv"
+        },
+        "tags": [
+            "Brigade"
+        ]
     }
 ]
\ No newline at end of file
diff --git a/crawler/package-lock.json b/crawler/package-lock.json
index a79df07864f..411e5c63c74 100644 --- a/crawler/package-lock.json +++ b/crawler/package-lock.json @@ -1501,7 +1501,6 @@ "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", - "dev": true, "requires": { "sprintf-js": "~1.0.2" } @@ -1569,6 +1568,11 @@ "resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.3.tgz", "integrity": "sha512-z/WhQ5FPySLdvREByI2vZiTWwCnF0moMJ1hK9YQwDTHKh6I7/uSckMetoRGb5UBZPC1z0jlw+n/XCgjeH7y1AQ==" }, + "async-exit-hook": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/async-exit-hook/-/async-exit-hook-2.0.1.tgz", + "integrity": "sha512-NW2cX8m1Q7KPA7a5M2ULQeZ2wR5qI5PAbw5L0UOMxdioVk9PMZ0h1TmyZEkPYrCvYjDlFICusOu1dlEKAAeXBw==" + }, "asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", @@ -2666,8 +2670,7 @@ "esprima": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", - "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", - "dev": true + "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==" }, "esquery": { "version": "1.1.0", @@ -2699,6 +2702,11 @@ "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", "dev": true }, + "eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==" + }, "exec-sh": { "version": "0.3.4", "resolved": "https://registry.npmjs.org/exec-sh/-/exec-sh-0.3.4.tgz", @@ -3636,6 +3644,33 @@ "semver": "^5.7.0" } }, + "git-up": { + "version": "4.0.5", + "resolved": 
"https://registry.npmjs.org/git-up/-/git-up-4.0.5.tgz", + "integrity": "sha512-YUvVDg/vX3d0syBsk/CKUTib0srcQME0JyHkL5BaYdwLsiCslPWmDSi8PUMo9pXYjrryMcmsCoCgsTpSCJEQaA==", + "requires": { + "is-ssh": "^1.3.0", + "parse-url": "^6.0.0" + }, + "dependencies": { + "normalize-url": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz", + "integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==" + }, + "parse-url": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/parse-url/-/parse-url-6.0.0.tgz", + "integrity": "sha512-cYyojeX7yIIwuJzledIHeLUBVJ6COVLeT4eF+2P6aKVzwvgKQPndCBv3+yQ7pcWjqToYwaligxzSYNNmGoMAvw==", + "requires": { + "is-ssh": "^1.3.0", + "normalize-url": "^6.1.0", + "parse-path": "^4.0.0", + "protocols": "^1.4.0" + } + } + } + }, "gitsheets": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/gitsheets/-/gitsheets-0.2.2.tgz", @@ -6609,7 +6644,6 @@ "version": "3.13.1", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz", "integrity": "sha512-YfbcO7jXDdyj0DGxYVSlSeQNHbD7XPWvrVWeVUujrQEoZzWJIRrCPoyk6kL6IAjAG2IolMK4T0hNUe0HOUs5Jw==", - "dev": true, "requires": { "argparse": "^1.0.7", "esprima": "^4.0.0" @@ -7411,6 +7445,23 @@ "p-limit": "^2.0.0" } }, + "p-queue": { + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", + "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", + "requires": { + "eventemitter3": "^4.0.4", + "p-timeout": "^3.2.0" + } + }, + "p-timeout": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", + "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", + "requires": { + "p-finally": "^1.0.0" + } + }, "p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", @@ 
-8387,8 +8438,7 @@ "sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", - "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", - "dev": true + "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=" }, "sshpk": { "version": "1.16.1", diff --git a/crawler/package.json b/crawler/package.json index e4b41f400a4..166d5c6cef0 100644 --- a/crawler/package.json +++ b/crawler/package.json @@ -5,9 +5,13 @@ "author": "Chris Alfano ", "license": "Apache-2.0", "dependencies": { + "async-exit-hook": "^2.0.1", "axios": "^0.19.2", "csv-parser": "^2.3.3", + "git-up": "^4.0.1", "gitsheets": "^0.2.2", + "js-yaml": "^3.13.1", + "p-queue": "^6.3.0", "parse-link-header": "^1.0.1", "progress": "^2.0.3", "yargs": "^14.2.3" diff --git a/crawler/run.js b/crawler/run.js index 1df238949fa..62debbea547 100755 --- a/crawler/run.js +++ b/crawler/run.js @@ -50,6 +50,10 @@ require('yargs') describe: 'A target branch/ref to commit the imported orgs tree to', type: 'string' }, + 'scratch-git-dir': { + describe: 'Path to a directory to use as a persistent scratch git repository', + type: 'string' + }, debug: { describe: 'Enable more verbose output', type: 'boolean', @@ -72,6 +76,7 @@ require('yargs') commitOrgsTo, // run options + scratchGitDir = process.env.SCRATCH_GIT_DIR || null, debug, } = argv; @@ -87,6 +92,12 @@ require('yargs') })); + // configure scratch git dir + if (scratchGitDir) { + await require('./lib/ScratchGit.js').setPersistentGitDir(scratchGitDir); + } + + // prepare interfaces const sheets = await GitSheets.create(); const { repo, git } = sheets;