Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@
"--all",
"--commit-to=snapshot/v1",
"--commit-orgs-to=cfapi/orgs/v1",
// "--orgs-source=file://${workspaceFolder}/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json"
"--orgs-source=file://${workspaceFolder}/crawler/lib/repositories/organizations/__fixtures__/test-organizations.json",
"--scratch-git-dir=/tmp/bpi-scratch.git"
],
"skipFiles": [
"<node_internals>/**"
Expand Down
96 changes: 96 additions & 0 deletions crawler/lib/ScratchGit.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
const logger = require('winston');
const tmp = require('tmp-promise');
const gitUp = require('git-up');
const exitHook = require('async-exit-hook');
const { Repo } = require('hologit/lib');
const { default: PQueue } = require('p-queue');

// Module-level singletons shared by every ScratchGit static method.

// hologit Repo instance for the scratch git directory; assigned by
// setPersistentGitDir() or lazily by getRepo()
let cachedRepo = null;
// git interface obtained from cachedRepo.getGit(); assigned alongside cachedRepo
let cachedGit = null;
// lookup table consulted by fetchRemote(), keyed by "<host><path>/<ref>"
const cachedFetchResults = new Map();
// concurrency of 1 means queued fetchRemote() jobs run one at a time
const fetchQueue = new PQueue({ concurrency: 1 });
// credentials interpolated into the fetch URL for github.com remotes
const { GITHUB_ACTOR, GITHUB_TOKEN } = process.env;

module.exports = class ScratchGit {

/**
* Set the path of a persistent git directory to use as scratch space
* for all future operations
* @param {string} gitDir
*/
static async setPersistentGitDir (gitDir) {
// initialize hologit Repo instance
cachedRepo = new Repo({
gitDir: gitDir,
ref: 'HEAD'
});
cachedGit = await cachedRepo.getGit();

logger.info(`initializing persistent GIT_DIR=${gitDir}`);
await cachedGit.init({ bare: true });
}

static async getRepo () {
if (!cachedRepo) {
// create temporary git directory
const gitDir = await tmp.dir({ unsafeCleanup: true });
exitHook(callback => gitDir.cleanup().then(callback));

// initialize hologit Repo instance
cachedRepo = new Repo({
gitDir: gitDir.path,
ref: 'HEAD'
});
cachedGit = await cachedRepo.getGit();

logger.info(`initializing temporary GIT_DIR=${gitDir.path}`);
await cachedGit.init({ bare: true });
}

return cachedRepo;
}

static async getGit () {
if (!cachedGit) {
// will set cachedGit
await this.getRepo();
}

return cachedGit;
}

/**
* Fetch the current commit hash for given url/ref
* @param {string} url
* @param {string=HEAD} ref
*/
static async fetchRemote (url, ref = 'HEAD') {
return fetchQueue.add(async () => {
const { resource, pathname } = gitUp(url);
const remoteKey = `${resource}${pathname.replace(/\.git$/i, '')}/${ref}`;

if (cachedFetchResults.has(remoteKey)) {
return cachedFetchResults.get(remoteKey);
}

let fetchUrl;
if (resource == 'github.com') {
fetchUrl = `https://${GITHUB_ACTOR}:${GITHUB_TOKEN}@github.com${pathname}.git`;
} else {
fetchUrl = url;
}

const cacheRef = `refs/crawler-cache/${remoteKey}`;
const git = await this.getGit();

logger.debug(`fetching ${url}#${ref}`);
try {
const fetchResult = await git.fetch({ depth: 1, tags: false }, fetchUrl, `+${ref}:${cacheRef}`);
logger.silly(`fetched ${url}#${ref}: ${fetchResult}`);
return git.revParse({ verify: true }, cacheRef, { $nullOnError: true });
} catch (err) {
logger.debug(`could not fetch ${url}#${ref}: ${err}`);
}
});
}
};
28 changes: 28 additions & 0 deletions crawler/lib/decorators/Git.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
const logger = require('winston');
const gitUp = require('git-up');

const ScratchGit = require('../ScratchGit.js');

/**
* Decorate a project with data about its git repository
* @class
*/
module.exports = class GitDecorator {

static async canDecorate ({ code_url = null, github = null, git = null }) {
if (!code_url) {
return false;
}

const { resource, pathname, protocols } = gitUp(code_url);

return Boolean(resource && pathname && protocols.length && github !== false);
}

static async decorate (projectData) {
// TODO: sniff ${url}/info/refs for HTTP to catch bad repos? ehh github doesn't like
const commitHash = await ScratchGit.fetchRemote(projectData.code_url);

projectData.git = commitHash ? { head: commitHash } : false;
}
};
34 changes: 34 additions & 0 deletions crawler/lib/decorators/PublicCode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
const logger = require('winston');
const gitUp = require('git-up');
const yaml = require('js-yaml');

const ScratchGit = require('../ScratchGit.js');

/**
* Decorate a project with publiccode.yml data from git
* @class
*/
module.exports = class PublicCodeDecorator {

static async canDecorate ({ git = null }) {
return Boolean(git && git.head);
}

static async decorate (projectData) {
const git = await ScratchGit.getGit();

let publicCodeData;

const publicCodeString = await git.catFile({ p: true }, `${projectData.git.head}:publiccode.yml`, { $nullOnError: true });
if (publicCodeString) {
try {
publicCodeData = yaml.safeLoad(publicCodeString);
} catch (err) {
logger.warn(`failed to load ${projectData.code_url}/publiccode.yml: ${err}`);
projectData['publiccode.yml'] = false;
}
}

projectData['publiccode.yml'] = publicCodeData || false;
}
};
39 changes: 39 additions & 0 deletions crawler/lib/decorators/__tests__/PublicCode.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
const PublicCode = require('../PublicCode.js');


describe('Project Decorator: PublicCode', () => {

    // PublicCode reads publiccode.yml out of the commit recorded in
    // projectData.git.head, which the Git decorator is responsible for
    // populating -- so it has to run first
    const Git = require('../Git.js');

    test('populates publiccode', async () => {
        const projectData = {
            code_url: 'https://github.com/italia/18app'
        };

        // fetch the repository so that git.head is available
        expect(await Git.canDecorate(projectData)).toBeTrue();
        await Git.decorate(projectData);
        expect(projectData.git).toEqual({ head: expect.any(String) });

        expect(await PublicCode.canDecorate(projectData)).toBeTrue();

        await PublicCode.decorate(projectData);

        expect(projectData['publiccode.yml']).toBeObject();
        expect(projectData['publiccode.yml']).toHaveProperty('url', 'https://github.com/italia/18app');
        expect(projectData['publiccode.yml']).toContainKeys([
            'categories',
            'description',
            'developmentStatus',
            'intendedAudience',
            'it',
            'legal',
            'localisation',
            'logo',
            'maintenance',
            'name',
            'platforms',
            'publiccodeYmlVersion',
            'releaseDate',
            'softwareType',
            'url',
        ]);
        expect(projectData['publiccode.yml'].categories).toBeArray();
        expect(projectData['publiccode.yml'].categories).toEqual(['mobile-payment']);

    });

});
2 changes: 2 additions & 0 deletions crawler/lib/decorators/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Registry of the project decorator classes that are currently enabled,
// keyed by decorator name.
const decoratorClasses = {
GitHubRepository: require('./GitHubRepository.js'),
Git: require('./Git.js'),
// Not registered yet. NOTE(review): PublicCode.canDecorate requires git.head,
// which Git.decorate sets -- confirm decorators are applied in an order that
// runs Git first before enabling this entry.
// PublicCode: require('./PublicCode.js'),
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,22 @@
"Code for America",
"Official"
]
},
{
"name": "Code for Gainesville",
"website": "http://c4gnv.com",
"events_url": "http://www.meetup.com/Code-for-Gainesville/",
"rss": "",
"projects_list_url": "https://docs.google.com/spreadsheets/d/15DOB5ctu8aJEkaE1pjBIvRaZBG2wjbpJaUQUa6UL9bM/pub?output=csv",
"city": "Gainesville, FL",
"latitude": "29.651634",
"longitude": "-82.324826",
"type": "Brigade",
"social_profiles": {
"twitter": "@c4gnv"
},
"tags": [
"Brigade"
]
}
]
62 changes: 56 additions & 6 deletions crawler/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions crawler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
"author": "Chris Alfano <[email protected]>",
"license": "Apache-2.0",
"dependencies": {
"async-exit-hook": "^2.0.1",
"axios": "^0.19.2",
"csv-parser": "^2.3.3",
"git-up": "^4.0.1",
"gitsheets": "^0.2.2",
"js-yaml": "^3.13.1",
"p-queue": "^6.3.0",
"parse-link-header": "^1.0.1",
"progress": "^2.0.3",
"yargs": "^14.2.3"
Expand Down
Loading