Skip to content

Commit 20918d6

Browse files
committed
Added support for multiple, read-only, caching directories.
1 parent 83b161b commit 20918d6

File tree

3 files changed

+80
-25
lines changed

3 files changed

+80
-25
lines changed

cached-translated-groovy3-parser.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,11 +360,16 @@ def analyze_nf_source(
360360
jsonfile: "str",
361361
resultfile: "str",
362362
cache_directory: "Optional[str]" = None,
363+
ro_cache_directories: "Sequence[str]" = [],
363364
) -> "Union[RuleNode, LeafNode, EmptyNode]":
364365
with open(filename, mode="r", encoding="utf-8") as wfH:
365366
content = wfH.read()
366367

367-
t_tree = parse_and_digest_groovy_content(content, cache_directory=cache_directory)
368+
t_tree = parse_and_digest_groovy_content(
369+
content,
370+
cache_directory=cache_directory,
371+
ro_cache_directories=ro_cache_directories,
372+
)
368373

369374
# These are for debugging purposes
370375
# logging.debug(tree.pretty())
@@ -409,6 +414,16 @@ def analyze_nf_source(
409414
print(
410415
"[WARNING] No caching is done. If you want to cache parsed content declare variable GROOVY_CACHEDIR"
411416
)
417+
418+
ro_cache_directories = []
419+
cache_directory_ro = os.environ.get("GROOVY_CACHEDIRS_RO")
420+
if cache_directory_ro is not None:
421+
print(f"* Using as read-only caching directories {cache_directory_ro}")
422+
ro_cache_directories = cache_directory_ro.split(":")
423+
else:
424+
print(
425+
"[WARNING] No read-only caching is used. If you want to use cached parsed contents declare variable GROOVY_CACHEDIRS_RO, separating more than one path by colons"
426+
)
412427
for filename in sys.argv[1:]:
413428
print(f"* Parsing {filename}")
414429
logfile = filename + ".lark"
@@ -420,7 +435,11 @@ def analyze_nf_source(
420435
log.addHandler(fH) # set the new handler
421436
try:
422437
analyze_nf_source(
423-
filename, jsonfile, resultfile, cache_directory=cache_directory
438+
filename,
439+
jsonfile,
440+
resultfile,
441+
cache_directory=cache_directory,
442+
ro_cache_directories=ro_cache_directories,
424443
)
425444
except Exception as e:
426445
print(f"\tParse failed, see {logfile}")

groovy_parser/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# -*- coding: utf-8 -*-
33

44
# SPDX-License-Identifier: Apache-2.0
5-
# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández
5+
# Copyright (C) 2025 Barcelona Supercomputing Center, José M. Fernández
66
#
77
# Licensed under the Apache License, Version 2.0 (the "License");
88
# you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
1717
# limitations under the License.
1818

1919
__author__ = "José M. Fernández <https://orcid.org/0000-0002-4806-5140>"
20-
__copyright__ = "2024 Barcelona Supercomputing Center (BSC), ES"
20+
__copyright__ = "2025 Barcelona Supercomputing Center (BSC), ES"
2121
__license__ = "Apache-2.0"
2222

2323
# https://www.python.org/dev/peps/pep-0396/
24-
__version__ = "0.1.2"
24+
__version__ = "0.2.0"

groovy_parser/parser.py

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# -*- coding: utf-8 -*-
33

44
# SPDX-License-Identifier: Apache-2.0
5-
# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández
5+
# Copyright (C) 2025 Barcelona Supercomputing Center, José M. Fernández
66
#
77
# Licensed under the Apache License, Version 2.0 (the "License");
88
# you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
2424
import json
2525
import os
2626
import os.path
27+
import pathlib
2728
from typing import (
2829
cast,
2930
TYPE_CHECKING,
@@ -157,6 +158,7 @@ def create_groovy_parser() -> "Lark":
157158
# parser='lalr',
158159
# debug=True,
159160
start="compilation_unit",
161+
# ambiguity='explicit',
160162
# lexer_callbacks={
161163
# 'square_bracket_block': jarlmethod
162164
# }
@@ -217,13 +219,21 @@ def digest_lark_tree(
217219

218220
def parse_and_digest_groovy_content(
219221
content: "str",
220-
cache_directory: "Optional[str]" = None,
222+
ro_cache_directories: "Optional[Sequence[Union[str, os.PathLike[str]]]]" = None,
223+
cache_directory: "Optional[Union[str, os.PathLike[str]]]" = None,
221224
prune: "Sequence[str]" = ["sep", "nls"],
222225
noflat: "Sequence[str]" = ["script_statement"],
223226
) -> "Union[RuleNode, LeafNode, EmptyNode]":
224227
t_tree: "Optional[Union[RuleNode, LeafNode, EmptyNode]]" = None
225-
hashfile: "Optional[str]" = None
226-
if cache_directory is not None and os.path.isdir(cache_directory):
228+
hashpath: "Optional[pathlib.Path]" = None
229+
cache_path: "Optional[pathlib.Path]" = None
230+
if cache_directory is not None:
231+
if isinstance(cache_directory, pathlib.Path):
232+
cache_path = cache_directory
233+
else:
234+
cache_path = pathlib.Path(cache_directory)
235+
236+
if cache_path is not None and cache_path.is_dir():
227237
h = hashlib.sha256()
228238
buff = bytearray(BLOCK_SIZE)
229239

@@ -246,31 +256,57 @@ def parse_and_digest_groovy_content(
246256
# Now we can obtain the relative directory, unique to this
247257
# version of the software and its dependencies
248258
hreldir = h.copy().hexdigest()
249-
this_cache_directory = os.path.join(cache_directory, hreldir)
250-
os.makedirs(this_cache_directory, exist_ok=True)
251259

252-
# Now, let's go for the content signature
253-
h.update(content.encode("utf-8"))
254-
hashfile = os.path.join(this_cache_directory, h.hexdigest() + ".json.gz")
260+
ro_cache_paths: "MutableSequence[pathlib.Path]" = []
261+
if ro_cache_directories is not None:
262+
for ro_cache_directory in ro_cache_directories:
263+
if isinstance(ro_cache_directory, pathlib.Path):
264+
ro_cache_path = ro_cache_directory
265+
else:
266+
ro_cache_path = pathlib.Path(ro_cache_directory)
267+
268+
# Include only existing cache paths
269+
this_ro_cache_path = ro_cache_path / hreldir
270+
if this_ro_cache_path.is_dir():
271+
ro_cache_paths.append(this_ro_cache_path)
255272

256-
if os.path.isfile(hashfile):
257-
try:
258-
with gzip.open(hashfile, mode="rt", encoding="utf-8") as jH:
259-
t_tree = json.load(jH)
260-
except:
261-
# If it is unreadable, re-create
262-
pass
273+
this_cache_path = cache_path / hreldir
274+
this_cache_path.mkdir(parents=True, exist_ok=True)
263275

264-
if t_tree is None:
276+
ro_cache_paths.append(this_cache_path)
277+
278+
# Now, let's go for the content signature
279+
h.update(content.encode("utf-8"))
280+
rel_hashpath = h.hexdigest() + ".json.gz"
281+
282+
# This is needed in case nothing was available
283+
hashpath = this_cache_path / rel_hashpath
284+
for ro_cache_path in ro_cache_paths:
285+
ro_hashpath = ro_cache_path / rel_hashpath
286+
if ro_hashpath.is_file():
287+
try:
288+
with gzip.open(
289+
ro_hashpath.as_posix(), mode="rt", encoding="utf-8"
290+
) as jH:
291+
t_tree = json.load(jH)
292+
hashpath = None
293+
break
294+
except:
295+
# If it is unreadable, re-create
296+
pass
297+
298+
if t_tree is None and (hashpath is not None or cache_path is None):
265299
tree = parse_groovy_content(content)
266300
t_tree = LarkFilteringTreeEncoder().default(
267301
tree,
268302
prune=prune,
269303
noflat=noflat,
270304
)
271305

272-
if hashfile is not None:
273-
with gzip.open(hashfile, mode="wt", encoding="utf-8") as jH:
274-
json.dump(t_tree, jH, sort_keys=True)
306+
assert t_tree is not None
307+
308+
if hashpath is not None:
309+
with gzip.open(hashpath.as_posix(), mode="wt", encoding="utf-8") as jH:
310+
json.dump(t_tree, jH, sort_keys=True)
275311

276312
return t_tree

0 commit comments

Comments
 (0)