Skip to content

Commit 6ed78b5

Browse files
authored
Merge pull request #249 from gavargas22/feat/parallel_processing
Using concurrent futures for processing multiple las files in parallel
2 parents b44e90b + 0e3f812 commit 6ed78b5

File tree

5 files changed

+63
-27
lines changed

5 files changed

+63
-27
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,3 +71,7 @@ csv-plugin.xml
7171

7272
# temporary test folder
7373
tests/temp
74+
75+
# UV related files
76+
uv.lock
77+
.python-version

pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,3 @@
1-
[build-system]
2-
requires = ["hatchling", "hatch-vcs"]
3-
build-backend = "hatchling.build"
4-
51
[project]
62
name = "welly"
73
dynamic = ["version"]
@@ -22,6 +18,7 @@ classifiers = [
2218
"License :: OSI Approved :: Apache Software License",
2319
"Operating System :: OS Independent"
2420
]
21+
requires-python = ">=3.9"
2522
dependencies = [
2623
"numpy",
2724
"scipy",
@@ -61,3 +58,7 @@ testpaths = ["tests"]
6158
[tool.setuptools_scm]
6259
write_to = "welly/_version.py"
6360
git_describe_command = "git describe --dirty --tags --long --match v* --first-parent"
61+
62+
[build-system]
63+
requires = ["hatchling", "hatch-vcs"]
64+
build-backend = "hatchling.build"

welly/las.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -482,11 +482,11 @@ def file_from_url(url):
482482
text_file (StringIO): an in-memory stream for text.
483483
"""
484484
try:
485-
text_file = StringIO(request.urlopen(url).read().decode())
485+
with request.urlopen(url) as response:
486+
content = response.read().decode()
487+
return content # Return the content directly instead of wrapping in StringIO
486488
except error.HTTPError as e:
487-
raise Exception('Could not retrieve url: ', e)
488-
489-
return text_file
489+
raise Exception(f'Could not retrieve url: {url} - {e}')
490490

491491

492492
def get_las_version(las):

welly/project.py

Lines changed: 41 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,26 @@
2121
from .plot import plot_kdes_project, plot_map_project
2222

2323

24+
def _load_well_from_las(filepath, remap=None, funcs=None, data=True, req=None, alias=None, encoding=None, printfname=None, index=None, **kwargs):
25+
"""Helper function for concurrent well loading."""
26+
try:
27+
# Handle URLs directly in the subprocess to avoid file handle issues
28+
# when passing file objects between processes
29+
return Well.from_las(filepath,
30+
remap=remap,
31+
funcs=funcs,
32+
data=data,
33+
req=req,
34+
alias=alias,
35+
encoding=encoding,
36+
printfname=printfname,
37+
index=index,
38+
**kwargs)
39+
except Exception as e:
40+
print(f"Error loading well {filepath}: {e}")
41+
return None
42+
43+
2444
class Project(object):
2545
"""
2646
Just a list of Well objects.
@@ -163,6 +183,9 @@ def from_las(cls,
163183
Returns:
164184
project. The project object.
165185
"""
186+
import concurrent.futures
187+
from tqdm import tqdm
188+
166189
if max is None:
167190
max = 1e12
168191
if (req is not None) and (alias is None):
@@ -180,20 +203,24 @@ def from_las(cls,
180203
else:
181204
uris = path # It's a list-like of files and/or URLs.
182205

183-
wells = [Well.from_las(f,
184-
remap=remap,
185-
funcs=funcs,
186-
data=data,
187-
req=req,
188-
alias=alias,
189-
encoding=encoding,
190-
printfname=printfname,
191-
index=index,
192-
**kwargs,
193-
)
194-
for i, f in tqdm(enumerate(uris)) if i < max]
195-
196-
return cls(list(filter(None, wells)))
206+
# Limit to the maximum number of wells requested
207+
uris = [f for i, f in enumerate(uris) if i < max]
208+
209+
wells = []
210+
with concurrent.futures.ProcessPoolExecutor() as executor:
211+
# Submit all tasks and create a mapping of futures to original indices
212+
future_to_idx = {executor.submit(_load_well_from_las, uri, remap=remap, funcs=funcs, data=data, req=req, alias=alias, encoding=encoding, printfname=printfname, index=index, **kwargs): i for i, uri in enumerate(uris)}
213+
214+
# Use tqdm to show a progress bar
215+
for future in tqdm(concurrent.futures.as_completed(future_to_idx), total=len(uris), desc="Loading wells"):
216+
try:
217+
well = future.result()
218+
if well is not None:
219+
wells.append(well)
220+
except Exception as e:
221+
print(f"Error loading well: {e}")
222+
223+
return cls(wells, source=path)
197224

198225
def add_canstrat_striplogs(self, path, uwi_transform=None, name='canstrat'):
199226
"""

welly/well.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -298,11 +298,15 @@ def from_las(cls,
298298
if printfname:
299299
print(fname)
300300

301-
# If https URL is passed try reading and formatting it to text file.
302-
if re.match(r'https?://.+\..+/.+?', fname) is not None:
303-
fname = file_from_url(fname)
304-
305-
datasets = from_las(fname, encoding=encoding, **kwargs)
301+
# If https URL is passed, download the content
302+
is_url = re.match(r'https?://.+\..+/.+?', fname) is not None
303+
if is_url:
304+
content = file_from_url(fname)
305+
# Pass the content string directly to from_las
306+
datasets = from_las(content, encoding=encoding, **kwargs)
307+
else:
308+
# Regular file path
309+
datasets = from_las(fname, encoding=encoding, **kwargs)
306310

307311
# Create well from datasets.
308312
well = cls.from_datasets(datasets,

0 commit comments

Comments (0)