Skip to content

Commit a56df9d

Browse files
authored
Merge pull request #10 from isselab/cleanup
Cleanup
2 parents ecbeb04 + a8146a9 commit a56df9d

12 files changed

+103
-46
lines changed

AstToEcoreConverter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,7 +1099,7 @@ def get_method_def_from_internal_structure(self, method_name, module):
10991099
return current_method[0]
11001100
return None
11011101

1102-
def create_method_signature(self, method_node, name, arguments, return_type = None):
1102+
def create_method_signature(self, method_node, name, arguments, return_type=None):
11031103
"""
11041104
Creates a method signature for a method definition.
11051105
@@ -1134,7 +1134,7 @@ def create_method_signature(self, method_node, name, arguments, return_type = No
11341134

11351135
# Add type for TParameter.type
11361136
parameter_type = self.create_ecore_instance(NodeTypes.CLASS)
1137-
#parameter_type.tName = arg.annotation if arg.annotation else 'None'
1137+
# parameter_type.tName = arg.annotation if arg.annotation else 'None'
11381138
parameter.type = parameter_type
11391139

11401140
method_node.signature = method_signature

CustomDataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \
1010
convert_hashed_names_to_float
1111
from Encoder import multi_hot_encoding
12-
from GraphClasses import defined_labels
12+
from settings import CONFIG
1313

1414

1515
class RepositoryDataset(Dataset):
@@ -30,7 +30,8 @@ def __init__(self, directory, label_list=None):
3030
print(e)
3131
# nodes have 11 features, their one hot encoded node type, hashed name, and one hot encoded library flag
3232
self.num_node_features = 11
33-
self.num_classes = len(defined_labels)
33+
self.defined_labels = CONFIG['graph']['defined_labels']
34+
self.num_classes = len(self.defined_labels)
3435
self.directory = directory
3536
self.graph_names = []
3637
self.graph_dir = os.listdir(directory)
@@ -162,7 +163,7 @@ def convert_labeled_graphs(self, labels):
162163
graph_labels) # count how many repos are in each class
163164

164165
# encode labels
165-
encoded_nodes = multi_hot_encoding(defined_labels, graph_labels)
166+
encoded_nodes = multi_hot_encoding(self.defined_labels, graph_labels)
166167
file = zip(graph_names, encoded_nodes)
167168
return file
168169

EcoreToMatrixConverter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,8 @@ def convert_subpackages_recursive(self, t_package):
335335
t_package: The package to convert subpackages from.
336336
"""
337337
for t_subpackage in t_package.subpackages:
338-
current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, t_package.tName,
338+
current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value,
339+
t_package.tName,
339340
NodeTypes.PACKAGE.value)
340341
if current_subpackage is None:
341342
self.node_matrix.append(NodeTypes.PACKAGE.value)

GCN.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
'''defines the architecture of the graph convolutional network'''
66

7+
78
class GCN(torch.nn.Module):
89
def __init__(self, num_node_features, num_classes, hidden_channels):
910
super(GCN, self).__init__()
@@ -33,4 +34,4 @@ def forward(self, x, edge_index, edge_attr, batch=None):
3334
# sigmoid activation function for multi-label
3435
x = f.sigmoid(x)
3536

36-
return x
37+
return x

GraphClasses.py

Lines changed: 0 additions & 3 deletions
This file was deleted.

NodeFeatures.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,20 @@ class NodeTypes(Enum):
1717
CLASS = "TClass"
1818
# TMethod
1919
METHOD = "TMethod"
20-
METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented.
21-
METHOD_DEFINITION = "TMethodDefinition"# missing "".overloading and "".overloadedBY does not need to be implemented.
20+
METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented.
21+
METHOD_DEFINITION = "TMethodDefinition" # missing "".overloading and "".overloadedBY does not need to be implemented.
2222
PARAMETER = "TParameter"
2323
# TField
2424
FIELD = "TField"
25-
FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
26-
FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented
25+
FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
26+
FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented
2727
# TAccess
2828
CALL = "TCall"
2929
READ = "TRead" # Todo implement this in AstToEcoreConverter
3030
WRITE = "TWrite" # Todo implement this in AstToEcoreConverter
3131
READ_WRITE = "TReadWrite" # Todo implement this in AstToEcoreConverter
32-
#TInterface
32+
# TInterface
3333
INTERFACE = "TInterface"
3434
# In Python, there is no formal concept of interfaces as found in some other programming languages like Java or C#.
3535
# However, Python supports a similar concept through the use of abstract base classes (ABCs) and duck typing.
36-
# The return on investment probably is not sufficient to justify the implementation.
36+
# The return on investment probably is not sufficient to justify the implementation.

Pipeline.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,6 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list
239239
global repo_multiprocess, ecore_graph
240240
global node_features, adj_list, edge_attribute
241241

242-
243242
# clone repositories for the dataset
244243
if repository_list is not None:
245244
download_repositories(repository_directory, repository_list)

README.md

Lines changed: 58 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,70 @@
1-
# github-classifier
1+
# Classifier for GitHub Repos
22

3-
**short description**
3+
## Table of Contents
4+
- [Intro](#intro)
5+
- [Installation for Users](#installation-instruction-for-users)
6+
- [Installation for Devs](#installation-instruction-for-devs)
7+
- [Expectation for Devs](#expectation-for-devs)
8+
- [Known Problems / Limitations](#known-problems--limitations)
9+
- [Help](#help)
410

5-
This repository contains a deep-learning based classification tool for software repositories. The tool utilizes the ecore metamodel 'type graph' and a graph convolutional network. To use the tool, run 'main.py' after adding the directory containing the repositories you want to classify.
11+
## Intro:
612

7-
If you want to train the tool with different labels, replace the current labels with your own (or add them to the labels) in GraphClasses.py, and in function 'multi_hot_encoding' in Encoder.py. Optionally also in function 'count_class_elements' in CustomDataset.py if you want to know the number of samples in each class in your dataset.
8-
The labels in the tool are not mutually exclusive and are multi-hot encoded.
13+
This repository features a deep learning classifier designed for the analysis of software repositories.
14+
The tool employs the ecore metamodel's 'type graph' in conjunction with a graph convolutional network.
15+
Presently, the classifier categorizes repositories into four distinct classes: Application, Framework, Library, and Plugin.
16+
It is important to note that the labels utilized by the tool are **not mutually exclusive** and are represented in a multi-hot encoded format.
917

10-
Currently, the tool only processes Python files.
18+
## Installation Instruction for Users:
19+
1. Clone the repository by executing the following command:
20+
`git clone https://github.com/isselab/github-classifier.git`
21+
2. Open the cloned repository using your preferred Integrated Development Environment (IDE).
22+
For the purposes of this instruction, we will assume the use of PyCharm from JetBrains.
23+
3. Change the directory to data/input by running the following command:
24+
`cd data/input`
25+
4. Clone the repositories you wish to analyze by executing:
26+
`git clone LINK_TO_REPO_YOU_WANT`
27+
5. Run main.py
1128

12-
**labels**
29+
The default threshold for identification is set at 50%.
30+
If you wish to modify this threshold, please locate the relevant settings in the settings.py file.
31+
After making the necessary adjustments, be sure to rerun main.py to apply the changes.
1332

14-
Application, Framework, Library, Plugin
33+
## Installation Instruction for Devs:
1534

16-
**data**
35+
### Basic Installation:
36+
1. Clone the repository by executing the following command:
37+
`git clone https://github.com/isselab/github-classifier.git`
38+
2. Open the cloned repository using your preferred Integrated Development Environment (IDE).
1739

18-
Dataset with Python software repositories from GitHub, all with a dependency on at least one ML library.
19-
The labeled repositories the tool is trained with are in data/labeled_dataset_repos.xlsx.
40+
### Retraining:
41+
1. Check data/labeled_dataset_repos.xlsx.
42+
This xlsx file contains the labeled repositories the tool is trained with.
43+
You may want to change it according to your needs.
44+
2. We strongly recommend utilizing a GPU for training purposes.
45+
To verify GPU availability, please run the TorchGPUCheck.py script.
46+
If you get the result "CUDA is available!", you may proceed to step 3.
47+
If the output is "CUDA is not available.", please follow the instructions provided in the terminal.
48+
Additionally, refer to the guide in the [Help](#help) section for further assistance in resolving any issues.
49+
3. Run prepareDataset.py
50+
4. Change the experiment_name in settings.py in the training section.
51+
5. Run training.py
2052

21-
**requirements**
2253

23-
pyecore~=0.14.0 or higher versions
54+
## Expectation for Devs:
55+
### Recommended Workflow:
56+
1. Create an issue in the GitHub issue page.
57+
2. Open a branch named after the issue
58+
3. Write code that fixes the issue
59+
4. Write test code to be sure it works.
60+
5. Comment your code well to be sure it can be understood.
61+
6. Create a merge request
2462

25-
autopep8
63+
## Known Problems / Limitations:
64+
- The tool only processes Python files.
65+
- Dataset contains Python software repositories from GitHub, all with a dependency on at least one ML library.
66+
- Labels cannot be changed easily (work in progress).
2667

27-
GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file" for instructions on how to install the tool
68+
## Help
69+
- Torch CUDA Guide, see "https://www.geeksforgeeks.org/how-to-set-up-and-run-cuda-operations-in-pytorch/"
70+
- GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file"

TorchGPUCheck.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import torch
2+
3+
"""
4+
This code is a simple Python script that checks if CUDA is available on the system and provides instructions on how to enable it if it's not available.
5+
"""
6+
7+
if __name__ == "__main__":
    # Report the installed PyTorch version. `torch.torch_version` is a
    # submodule, so printing it only shows a module repr; the version string
    # itself is exposed as `torch.__version__`.
    print(torch.__version__)
    # Check if CUDA is available
    if torch.cuda.is_available():
        print("CUDA is available!")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        print(f"Current GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    else:
        # No usable CUDA device was detected: print setup instructions.
        print("CUDA is not available.")
        print("To enable CUDA, follow these steps:")
        print("1. **Install NVIDIA Drivers**: Ensure you have the latest NVIDIA drivers installed on your system.")
        print(
            "2. **Install CUDA Toolkit**: Download and install the CUDA Toolkit from the official NVIDIA website: https://developer.nvidia.com/cuda-downloads")
        print(
            "3. **Verify CUDA Installation**: After installation, verify that CUDA is working correctly by running the `nvidia-smi` command in your terminal/command prompt.")
        print(
            "4. **Update PyTorch**: Make sure you're using the latest version of PyTorch. You can update PyTorch using pip: `pip install --upgrade torch`")
        print("5. **Restart Your System**: Restart your system to ensure that the changes take effect.")

pep8autoformat.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import autopep8
22

3+
34
def format_python_file(path_to_file):
45
try:
56
# Read the current content of the file
@@ -21,4 +22,4 @@ def format_python_file(path_to_file):
2122
if __name__ == "__main__":
2223
# Specify the file path you want to format
2324
file_path = 'AstToEcoreConverter.py'
24-
format_python_file(file_path)
25+
format_python_file(file_path)

torch_gpu_check.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
from CustomDataset import RepositoryDataset
1313
from GCN import GCN
14-
from GraphClasses import defined_labels
1514
from settings import CONFIG
1615

1716
'''please prepare the dataset you want to train the tool with by using prepareDataset.py,
@@ -27,14 +26,15 @@
2726
threshold = CONFIG['training']['threshold']
2827
save_classification_reports = CONFIG['training']['save_classification_reports']
2928
experiment_name = CONFIG['training']['experiment_name']
29+
defined_labels = CONFIG['graph']['defined_labels']
3030

3131

3232
def train():
3333
model.train()
3434

3535
num_classes = int(len(defined_labels))
3636

37-
for graph in tqdm(trainloader,desc = "Training"):
37+
for graph in tqdm(trainloader, desc="Training"):
3838

3939
if device == 'cuda':
4040
graph.x = graph.x.to(device)
@@ -67,7 +67,7 @@ def test(loader):
6767
total = 0
6868
num_classes = int(len(defined_labels))
6969

70-
for graph in tqdm(loader,desc = "Testing"):
70+
for graph in tqdm(loader, desc="Testing"):
7171

7272
if device == 'cuda':
7373
graph.x = graph.x.to(device)

0 commit comments

Comments
 (0)