Merge branch '__rultor'

rultor · rultor · commit 5bd7d59034ea · 2024-12-30T08:03:27.000Z
diff --git a/sr-data/src/sr_data/steps/workflows.py b/sr-data/src/sr_data/steps/workflows.py
@@ -1,6 +1,7 @@
 """
 Collect information about GitHub workflows in the repo.
 """
+import numpy as np
 # The MIT License (MIT)
 #
 # Copyright (c) 2024 Aliaksei Bialiauski
@@ -30,7 +31,7 @@
 
 def main(repos, out):
     frame = pd.read_csv(repos)
-    frame["workflows"] = frame["workflows"].fillna("")
+    frame["workflows"] = frame["workflows"].fillna(0)
     for idx, row in frame.iterrows():
         repo = row["repo"]
         branch = row["branch"]
@@ -65,13 +66,42 @@ def main(repos, out):
             if info["w_release"]:
                 releases = True
         frame.at[idx, "workflows"] = len(ymls)
+        frame["workflows"] = frame["workflows"]
         frame.at[idx, "w_jobs"] = tjobs
         frame.at[idx, "w_oss"] = len(set(oss))
         frame.at[idx, "w_steps"] = steps
-        frame.at[idx, "has_release_workflow"] = releases
+        frame.at[idx, "has_release_workflow"] = int(releases)
+        frame.at[idx, "w_simplicity"] = w_score(frame.loc[idx])
     frame.to_csv(out, index=False)
     logger.info(f"Saved repositories to {out}")
 
+wscope = ["workflows", "w_jobs", "w_oss", "w_steps", "has_release_workflow"]
+weights = {
+    "workflows": 0.3,
+    "w_jobs": 0.25,
+    "w_steps": 0.25,
+    "w_oss": 0.1,
+    "has_release_workflow": 0.1,
+}
+
+def w_score(row) -> float:
+    """
+    Workflow simplicity score.
+    Each metric listed in ``weights`` is inverted as ``1 - value`` and the
+    inverted values are combined into a weighted sum, so the result is a float
+    and can be negative when the raw counts exceed 1.
+    :param row: Row with a numeric value for every key of ``weights``,
+     e.g. a pandas Series taken from the repositories frame.
+    :return: Calculated metric for workflow simplicity score.
+    @todo #244:35min Enhance workflow simplicity score with min and max adjustment.
+     Currently, we just subtract collected value from 1. We should adjust it with
+     min and max values from the dataset. So formula should look like:
+     1 - (row - min) / (max - min).
+    """
+    # Driven by ``weights`` alone, so the set of scored metrics lives in one
+    # place instead of being repeated in a hand-written dict.
+    return sum((1 - row[key]) * weights[key] for key in weights)
+
 
 def fetch(path) -> str:
     return requests.get(f"https://raw.githubusercontent.com/{path}").text
diff --git a/sr-data/src/tests/resources/to-wscore.csv b/sr-data/src/tests/resources/to-wscore.csv
@@ -0,0 +1,2 @@
+repo,workflows,w_jobs,w_steps,w_oss,has_release_workflow
+foo/bar,1,2,3,3,0
diff --git a/sr-data/src/tests/test_workflows.py b/sr-data/src/tests/test_workflows.py
@@ -29,8 +29,7 @@
 import pandas as pd
 import pytest
 import yaml
-from sr_data.steps.workflows import workflow_info, main, fetch, \
-    used_for_releases
+from sr_data.steps.workflows import workflow_info, main, fetch, used_for_releases, w_score
 
 
 class TestWorkflows(unittest.TestCase):
@@ -90,7 +89,7 @@ def test_outputs_workflow_info_correctly(self):
             f"Steps count in workflow: '{info}' does not match with expected"
         )
 
-    @pytest.mark.fast
+    @pytest.mark.nightly
     def test_collects_unique_oss_across_all_files(self):
         with TemporaryDirectory() as temp:
             path = os.path.join(temp, "workflows.csv")
@@ -109,7 +108,7 @@ def test_collects_unique_oss_across_all_files(self):
                 f"OSS count: {oss} does not match with expected: {expected}"
             )
 
-    @pytest.mark.fast
+    @pytest.mark.nightly
     def test_collects_workflows_for_all(self):
         with TemporaryDirectory() as temp:
             path = os.path.join(temp, "workflows.csv")
@@ -129,7 +128,7 @@ def test_collects_workflows_for_all(self):
                 f"Frame {frame.columns} doesn't have expected columns"
             )
 
-    @pytest.mark.fast
+    @pytest.mark.nightly
     def test_counts_workflows_correctly(self):
         with TemporaryDirectory() as temp:
             path = os.path.join(temp, "workflows.csv")
@@ -403,3 +402,18 @@ def test_parses_oss_as_list_in_matrix(self):
             0,
             f"Steps count in workflow: '{info}' does not match with expected"
         )
+
+
+    @pytest.mark.fast
+    def test_calculates_simplicity_score(self):
+        # Row foo/bar,1,2,3,3,0 gives 0*0.3 - 1*0.25 - 2*0.25 - 2*0.1 + 1*0.1;
+        # the score is a float sum, so compare with a tolerance, not exactly.
+        fixture = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "resources/to-wscore.csv"
+        )
+        self.assertAlmostEqual(
+            w_score(pd.read_csv(fixture).iloc[0]),
+            -0.85,
+            places=9,
+            msg="Calculated score does not match with expected"
+        )

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+repo,workflows,w_jobs,w_steps,w_oss,has_release_workflow`
	`2`	`+foo/bar,1,2,3,3,0`