Skip to content

Optimize GitHub API usage by leveraging search response data for repository information [fixes #11] #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 69 additions & 143 deletions projects/initial-data/main.js
Original file line number Diff line number Diff line change
@@ -1,147 +1,75 @@
import { Octokit } from 'octokit';
import { getInput } from './setup.js';

import { DataRecorder } from './dataRecorder.js';

const CSV_FILE_NAME = `initialTopicRepoData-${Date.now()}.csv`;

/**
 * Looks up a repository and returns when it was created.
 *
 * @param {object} octokit - Client exposing `request(route, params)`.
 * @param {string} owner - Account or organization that owns the repository.
 * @param {string} repo - Repository name.
 * @returns {Promise<number>} `created_at` of the repository, run through
 *   `Date.parse` (epoch milliseconds, or NaN when the value is not parseable).
 */
export async function fetchRepoCreationDate(octokit, owner, repo) {
  console.log(`Fetching creation date for repository: ${owner}/${repo}`);

  const route = 'GET /repos/{owner}/{repo}';
  const { data: repository } = await octokit.request(route, { owner, repo });
  const { created_at: createdAt } = repository;

  return Date.parse(createdAt);
}

/**
 * Returns the author date of the oldest commit of a repository.
 *
 * Commits are requested one per page. When the response carries a `Link`
 * header with a `rel="last"` entry, that last page is fetched and its single
 * commit is used (the endpoint is expected to list newest commits first, so
 * the last page holds the oldest one). When there is no `rel="last"` entry
 * the first page is the only page and its commit is used directly.
 *
 * @param {object} octokit - Client exposing `request(route, params)`.
 * @param {string} owner - Account or organization that owns the repository.
 * @param {string} repo - Repository name.
 * @returns {Promise<number>} Author date of the oldest commit, run through
 *   `Date.parse` (epoch milliseconds).
 * @throws {Error} When the request fails or the repository has no commits;
 *   the underlying error is attached as `cause`.
 */
async function fetchFirstCommitDate(octokit, owner, repo) {
  console.log(`Fetching first commit date for repository: ${owner}/${repo}`);
  try {
    const response = await octokit.request(
      'GET /repos/{owner}/{repo}/commits',
      {
        owner,
        repo,
        per_page: 1,
      },
    );

    const lastPageUrl = response.headers.link?.match(
      /<([^>]+)>;\s*rel="last"/,
    )?.[1];

    // No rel="last" link means everything fit on the first page, so reuse it
    // instead of issuing a second request with an undefined URL.
    const oldestPage = lastPageUrl
      ? (await octokit.request(lastPageUrl)).data
      : response.data;

    if (oldestPage.length === 0) {
      throw new Error(`No commits found for ${owner}/${repo}`);
    }

    return Date.parse(oldestPage[0].commit.author.date);
  } catch (err) {
    // Same message callers saw before; keep the root cause for debugging.
    throw new Error(`Could not find any commits for ${owner}/${repo}`, {
      cause: err,
    });
  }
}

/**
 * Requests the topics endpoint of a repository and returns its topic names.
 *
 * @param {object} octokit - Client exposing `request(route, params)`.
 * @param {string} owner - Account or organization that owns the repository.
 * @param {string} repo - Repository name.
 * @returns {Promise<*>} The `names` field of the response payload.
 */
async function fetchRepoTopics(octokit, owner, repo) {
  console.log(`Fetching topics for repository: ${owner}/${repo}`);

  const route = 'GET /repos/{owner}/{repo}/topics';
  const { data: topicsPayload } = await octokit.request(route, { owner, repo });

  return topicsPayload.names;
}

async function fetchFirstReleaseDate(octokit, owner, repo) {
console.log(`Fetching first release date for repository: ${owner}/${repo}`);
try {
const response = await octokit.request(
'GET /repos/{owner}/{repo}/releases',
{
owner,
repo,
per_page: 1,
},
);
const lastPageUrl = response.headers.link?.match(
/<([^>]+)>;\s*rel="last"/,
)?.[1];

if (!lastPageUrl) {
if (response.data.length > 0) {
response.data[0].created_at;
} else {
throw new Error(`No releases found for ${owner}/${repo}`);
}
}

const lastPageResponse = await octokit.request(lastPageUrl);
if (lastPageResponse.data.length > 0) {
return Date.parse(lastPageResponse.data[0].created_at);
} else {
throw new Error(`No releases found for ${owner}/${repo}`);
/**
 * Collects repositories tagged with a topic through the repository search
 * endpoint, most recently updated first.
 *
 * @param {object} octokit - Client exposing `rest.search.repos(params)`.
 * @param {string} topic - Topic to search for (sent as `topic:<topic>`).
 * @param {number} numRepos - Maximum number of repositories to return, or
 *   `-1` for "as many as possible". Either way the result is capped at 1000.
 * @param {number} [pageDelayMs=1000] - Pause between consecutive page
 *   requests, in milliseconds, to stay clear of rate limits.
 * @returns {Promise<object[]>} Search result items, never more than the
 *   requested amount; empty when `numRepos` is 0 or not a positive number.
 */
async function fetchRepositoriesWithTopic(
  octokit,
  topic,
  numRepos,
  pageDelayMs = 1000,
) {
  console.log(`Fetching repositories with topic: ${topic}`);

  const MAX_PER_PAGE = 100; // allow max 100 items/page
  const MAX_RESULTS = 1000; // cap at 1000 results overall

  const maxResults =
    numRepos === -1 ? MAX_RESULTS : Math.min(numRepos, MAX_RESULTS);
  // Covers 0, negative values other than -1, and NaN: nothing to fetch.
  if (!(maxResults > 0)) return [];

  // Do not ask for a full page when only a handful of repositories is wanted.
  const perPage = Math.min(MAX_PER_PAGE, maxResults);
  const pages = Math.ceil(maxResults / perPage);

  const allRepos = [];

  for (let page = 1; page <= pages; page++) {
    console.log(`Fetching page ${page} of ${pages} for topic: ${topic}`);

    const response = await octokit.rest.search.repos({
      q: `topic:${topic}`,
      per_page: perPage,
      page,
      sort: 'updated',
      order: 'desc',
    });

    const repos = response.data.items;
    allRepos.push(...repos);

    // Stop on the final page, on a short page (no more results), or once
    // enough repositories have been collected.
    const isDone =
      page === pages ||
      repos.length < perPage ||
      allRepos.length >= maxResults;
    if (isDone) break;

    // Rate limits: only pause when another request is about to follow.
    await new Promise((resolve) => setTimeout(resolve, pageDelayMs));
  }

  // Always trim, whichever condition ended the loop.
  return allRepos.slice(0, maxResults);
}

export async function processRepository(octokit, owner, repo) {
console.log(`Processing repository: ${owner}/${repo}`);
const githubRepoURL = `https://github.com/${owner}/${repo}`;

const creationDate = await fetchRepoCreationDate(octokit, owner, repo);
let firstReleaseDate;
try {
firstReleaseDate = await fetchFirstReleaseDate(octokit, owner, repo);
} catch (err) {
throw new Error(`Unable to get releases for ${owner}/${repo}`);
}
const repoTopics = await fetchRepoTopics(octokit, owner, repo);

let firstCommitDate;
try {
firstCommitDate = await fetchFirstCommitDate(octokit, owner, repo);
} catch (err) {
throw new Error(`Error trying to find first commit for ${owner}/${repo}`);
}
console.log({ firstReleaseDate });

if (firstReleaseDate === null) {
console.log(`First release date: of ${githubRepoURL} unknown`);
}

if (firstCommitDate === null) {
console.log(`First commit date: of ${githubRepoURL} unknown`);
}

/**
 * Builds one CSV row from a repository search-result item, without any
 * further API calls.
 *
 * Key order matters: the caller writes `Object.values(row)` to the CSV.
 *
 * @param {object} repo - Search result item; `full_name`, `created_at` and
 *   `topics` are read.
 * @returns {{repository: string, repoTopics: string, date_first_commit: null,
 *   creation: number, date_first_release: null}} Row data. `creation` is
 *   `created_at` run through `Date.parse`; `repoTopics` is the topic list
 *   joined with ", " and wrapped in double quotes.
 */
function extractRepositoryData(repo) {
  console.log(`Extracting data for repository: ${repo.full_name}`);

  const creationDate = Date.parse(repo.created_at);

  // Topics are directly available in the search response
  const repoTopics = Array.isArray(repo.topics) ? repo.topics : [];

  return {
    repository: repo.full_name,
    repoTopics: `"${repoTopics.join(', ')}"`,
    date_first_commit: null, // skipping this as per requirements
    creation: creationDate,
    date_first_release: null, // skipping this as per requirements
  };
}

async function main(token, topic, numRepos) {
const octokit = new Octokit({ auth: token });

const iterator = octokit.paginate.iterator(octokit.rest.search.repos, {
q: `topic:${topic}`,
per_page: 100,
});
let processedRepos = 0;


// Fetch repositories with the specified topic
const repositories = await fetchRepositoriesWithTopic(octokit, topic, numRepos);
console.log(`Found ${repositories.length} repositories with topic: ${topic}`);

// Set up CSV recorder
const csvColumns = [
'repo',
'repo_topics',
Expand All @@ -150,26 +78,21 @@ async function main(token, topic, numRepos) {
'date_first_release',
];
const dataRecorder = new DataRecorder(`./data/${CSV_FILE_NAME}`, csvColumns);

for await (const iteration of iterator) {
const data = iteration.data;
for (const repo of data) {
try {
if (numRepos !== -1 && processedRepos >= numRepos) break;
const dataRow = await processRepository(
octokit,
repo.owner.login,
repo.name,
);

dataRecorder.appendToCSV(Object.values(dataRow));
processedRepos++;
console.log(`processed ${processedRepos}`);
} catch (err) {
console.error(err);
}

// Process each repository
let processedRepos = 0;
for (const repo of repositories) {
try {
const dataRow = extractRepositoryData(repo);
dataRecorder.appendToCSV(Object.values(dataRow));
processedRepos++;
console.log(`Processed ${processedRepos}/${repositories.length}`);
} catch (err) {
console.error(`Error processing repository ${repo.full_name}:`, err);
}
}

console.log(`Completed processing ${processedRepos} repositories`);
}

export function runMain() {
Expand All @@ -180,3 +103,6 @@ export function runMain() {

main(token, topic, numRepos);
}

// Exported for testing purposes only.
export { fetchRepositoriesWithTopic, extractRepositoryData };
27 changes: 22 additions & 5 deletions projects/initial-data/processRespository.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { tmpdir } from 'node:os';
import { sep } from 'node:path';
import fs from 'fs';

import { fetchRepoCreationDate } from './main.js';
import { fetchRepositoriesWithTopic, extractRepositoryData } from './main.js';
import { getInput } from './setup.js';
import { Octokit } from 'octokit';
import { DataRecorder } from './dataRecorder.js';
Expand Down Expand Up @@ -44,10 +44,27 @@ describe('Basic tests', () => {
});
});

it('Simple use of Octokit calls the GitHub API', async () => {
let data;
data = await fetchRepoCreationDate(octokit, 'octocat', 'hello-world');
expect(data).toEqual(1550934514000);
it('fetchRepositoriesWithTopic fetches repositories with the specified topic', async () => {
const repos = await fetchRepositoriesWithTopic(octokit, 'json-schema', 3);
expect(repos.length).toBeGreaterThan(0);
expect(repos.length).toBeLessThanOrEqual(3);
expect(repos[0].topics).toContain('json-schema');
});

it('extractRepositoryData correctly extracts data from a repository', () => {
  // Hand-built stand-in for a single search-result item.
  const createdAt = '2020-01-01T00:00:00Z';
  const searchItem = {
    full_name: 'owner/repo',
    created_at: createdAt,
    topics: ['json-schema', 'validation'],
  };

  const row = extractRepositoryData(searchItem);

  // Identity and topics are copied over; topics are joined and quoted.
  expect(row.repository).toBe('owner/repo');
  expect(row.repoTopics).toBe('"json-schema, validation"');

  // The creation date is stored as a parsed timestamp.
  expect(row.creation).toBe(Date.parse(createdAt));

  // Commit and release dates are not filled in by the extractor.
  expect(row.date_first_commit).toBeNull();
  expect(row.date_first_release).toBeNull();
});

it('DataRecorder writes JSON data to csv file', async () => {
Expand Down