
Commit 3ce39a1

Completes B letter flake8-bugbear rules (#1099)
* Completes the Ruff B-letter flake8-bugbear rules. Refactors code to take these rules into account.

1 parent: 8251ae4

69 files changed: 151 additions, 146 deletions

Note: large commits hide some content by default; only a subset of the 69 changed files is shown below.

contrib/hamilton/contrib/user/skrawcz/customize_embeddings/__init__.py

+1 −1

@@ -262,7 +262,7 @@ def construct_df(
     negatives_per_positive: int = 1,
     random_seed: int = 123,
 ) -> pd.DataFrame:
-    f"""Return dataframe of {base_df} paris with negatives added."""
+    """Return dataframe of {base_df} paris with negatives added."""
     return pd.concat(
         [
             base_df,
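Why the leading f was dropped (Ruff B021): Python stores only a plain string literal as a function's docstring, so an f-string in that position is evaluated and thrown away. A minimal sketch, not from this repo:

def with_fstring():
    f"""Docs for this function."""  # evaluated on each call, then discarded

def with_plain_string():
    """Docs stay attached."""

print(with_fstring.__doc__)       # None
print(with_plain_string.__doc__)  # 'Docs stay attached.'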

contrib/hamilton/contrib/user/zilto/lancedb_vdb/__init__.py

+2 −2

@@ -59,9 +59,9 @@ def table_ref(

     try:
         table = client.open_table(table_name)
-    except FileNotFoundError:
+    except FileNotFoundError as e:
         if schema is None:
-            raise ValueError("`schema` must be provided to create table.")
+            raise ValueError("`schema` must be provided to create table.") from e

         table = _create_table(
             client=client,
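The pattern above is Ruff B904: a raise inside an except block should chain the original error with `from`, so the traceback records the root cause instead of the ambiguous "During handling of the above exception, another exception occurred". A minimal sketch with hypothetical names:

import json

def load_config(path: str) -> dict:
    try:
        with open(path) as f:
            return json.load(f)
    except FileNotFoundError as e:
        # "from e" attaches the original error as __cause__ in the traceback
        raise RuntimeError(f"config file missing: {path}") from e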

contrib/hamilton/contrib/user/zilto/nixtla_statsforecast/__init__.py

+2 −2

@@ -125,7 +125,7 @@ def best_model_per_series(cross_validation_evaluation: pd.DataFrame) -> pd.Serie
 def inference_predictions(
     forecaster: StatsForecast,
     inference_forecast_steps: int = 12,
-    inference_confidence_percentile: list[float] = [90.0],
+    inference_confidence_percentile: list[float] = [90.0],  # noqa: B006
 ) -> pd.DataFrame:
     """Infer values using the training harness. Fitted models aren't stored

@@ -141,7 +141,7 @@ def plotting_config(
     plot_uids: Optional[list[str]] = None,
     plot_models: Optional[list[str]] = None,
     plot_anomalies: bool = False,
-    plot_confidence_percentile: list[float] = [90.0],
+    plot_confidence_percentile: list[float] = [90.0],  # noqa: B006
     plot_engine: str = "matplotlib",
 ) -> dict:
     """Configuration for plotting functions"""

contrib/hamilton/contrib/user/zilto/webscraper/__init__.py

+2 −2

@@ -54,8 +54,8 @@ def html_page(url: str) -> str:
 def parsed_html(
     url: str,
     html_page: str,
-    tags_to_extract: List[str] = ["p", "li", "div"],
-    tags_to_remove: List[str] = ["script", "style"],
+    tags_to_extract: List[str] = ["p", "li", "div"],  # noqa: B006
+    tags_to_remove: List[str] = ["script", "style"],  # noqa: B006
 ) -> ParsingResult:
     """Parse an HTML string using BeautifulSoup

contrib/hamilton/contrib/user/zilto/xgboost_optuna/__init__.py

+3 −1

@@ -133,7 +133,7 @@ def cross_validation_folds(

 def study(
     higher_is_better: bool,
-    pruner: Optional[optuna.pruners.BasePruner] = optuna.pruners.MedianPruner(),
+    pruner: Optional[optuna.pruners.BasePruner] = None,
     sampler: Optional[optuna.samplers.BaseSampler] = None,
     study_storage: Optional[str] = None,
     study_name: Optional[str] = None,

@@ -142,6 +142,8 @@ def study(
     """Create an optuna study; use the XGBoost + Optuna integration for pruning
     ref: https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_integration.py
     """
+    if pruner is None:
+        pruner = optuna.pruners.MedianPruner()
     return optuna.create_study(
         direction="maximize" if higher_is_better else "minimize",
         pruner=pruner,
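This one is Ruff B008 fixed properly rather than suppressed: `optuna.pruners.MedianPruner()` in the signature would be constructed once at import time and shared by every study. The None-sentinel rewrite defers construction to each call. A sketch of the hazard, with hypothetical names:

class Counter:
    def __init__(self):
        self.n = 0

def run(counter: Counter = Counter()):  # built once, at definition time
    counter.n += 1
    return counter.n

print(run())  # 1
print(run())  # 2 -- both calls mutated the same shared default instance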

contrib/setup.py

+2 −2

@@ -10,8 +10,8 @@
 try:
     with open("README.md") as readme_file:
         readme = readme_file.read()
-except Exception:
-    warnings.warn("README.md not found")
+except FileNotFoundError:
+    warnings.warn("README.md not found")  # noqa
     readme = None

 REQUIREMENTS_FILES = ["requirements.txt"]
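The bare `# noqa` here most likely silences B028, which wants an explicit `stacklevel` on `warnings.warn` so the warning points at the caller's line rather than at the warn call itself. What the rule asks for, as a sketch (hypothetical function):

import warnings

def read_legacy_setting():
    # stacklevel=2 attributes the warning to the caller, not to this line
    warnings.warn("legacy setting; use settings.toml", DeprecationWarning, stacklevel=2)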

examples/LLM_Workflows/GraphRAG/ingest_fighters.py

+1 −1

@@ -17,7 +17,7 @@ def raw_fighter_details() -> pd.DataFrame:

 def fighter(raw_fighter_details: pd.DataFrame) -> Parallelizable[pd.Series]:
     """We then want to do something for each record. That's what this code sets up"""
-    for idx, row in raw_fighter_details.iterrows():
+    for _, row in raw_fighter_details.iterrows():
         yield row
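This and several fixes below are Ruff B007: a loop variable the body never uses is renamed to `_` (or given a `_` prefix) to mark it as intentionally ignored. In sketch form:

rows = ["a", "b", "c"]
for _index, value in enumerate(rows):  # _index is bound but deliberately unused
    print(value)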

examples/LLM_Workflows/image_telephone/streamlit.py

+1 −2

@@ -403,8 +403,7 @@ def explore_display():
     image_urls_to_display = image_urls[0 : len(projection)]
     if len(image_urls_to_display) != len(projection):
         image_url_length = len(image_urls_to_display)
-        for i in range(len(projection) - len(image_urls_to_display)):
-            image_urls_to_display.append(image_urls[image_url_length - 1])
+        image_urls_to_display.append(image_urls[image_url_length - 1])
     embedding_path_plot(projection, image_urls_to_display, selected_entry, prompt_path)
     # highlight_point(projection, selected_entry)

examples/LLM_Workflows/knowledge_retrieval/state.py

+1 −1

@@ -137,7 +137,7 @@ def call_arxiv_function(messages, full_message):
             return response
         except Exception as e:
             logger.error(type(e))
-            raise Exception("Function chat request failed")
+            raise Exception("Function chat request failed") from e

    elif full_message["message"]["function_call"]["name"] == "read_article_and_summarize":
        parsed_output = json.loads(full_message["message"]["function_call"]["arguments"])

examples/LLM_Workflows/knowledge_retrieval/summarize_text.py

+1 −1

@@ -56,7 +56,7 @@ def pdf_text(pdf_path: pd.Series) -> pd.Series:
     :return: Series of strings of the PDFs' contents
     """
     _pdf_text = []
-    for i, file_path in pdf_path.items():
+    for _i, file_path in pdf_path.items():
         # creating a pdf reader object
         reader = PdfReader(file_path)
         text = ""

examples/LLM_Workflows/retrieval_augmented_generation/backend/server.py

+1 −1

@@ -59,7 +59,7 @@ class SummaryResponse(pydantic.BaseModel):


 @app.post("/store_arxiv", tags=["Ingestion"])
-async def store_arxiv(arxiv_ids: list[str] = fastapi.Form(...)) -> JSONResponse:
+async def store_arxiv(arxiv_ids: list[str] = fastapi.Form(...)) -> JSONResponse:  # noqa: B008
     """Retrieve PDF files of arxiv articles for arxiv_ids\n
     Read the PDF as text, create chunks, and embed them using OpenAI API\n
     Store chunks with embeddings in Weaviate.
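Here B008 is suppressed instead of fixed because FastAPI deliberately uses a call in the default position as a parameter declaration: the object `fastapi.Form(...)` returns is metadata the framework consumes, not shared mutable state. A minimal sketch of the idiom (hypothetical endpoint, assuming FastAPI is installed):

import fastapi

app = fastapi.FastAPI()

@app.post("/echo")
async def echo(text: str = fastapi.Form(...)) -> dict:  # noqa: B008
    # Form(...) declares "read this field from the form body"; it is
    # evaluated once by design and carries no per-request state.
    return {"text": text}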

examples/LLM_Workflows/scraping_and_chunking/spark/doc_pipeline.py

+2 −2

@@ -27,8 +27,8 @@ def article_text(url: str, article_regex: str) -> str:
     """
     try:
         html = requests.get(url)
-    except requests.exceptions.RequestException:
-        raise Exception(f"Failed to get URL: {url}")
+    except requests.exceptions.RequestException as e:
+        raise Exception(f"Failed to get URL: {url}") from e
     article = re.findall(article_regex, html.text, re.DOTALL)
     if not article:
         raise ValueError(f"No article found in {url}")

examples/LLM_Workflows/scraping_and_chunking/spark/spark_pipeline.py

+3 −1

@@ -27,7 +27,9 @@ def sitemap_text(sitemap_url: str = "https://hamilton.dagworks.io/en/latest/site
     try:
         sitemap = requests.get(sitemap_url)
     except Exception as e:
-        raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}")
+        raise RuntimeError(
+            f"Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}"
+        ) from e
     return sitemap.text

examples/dagster/dagster_code/tutorial/assets.py

+1 −1

@@ -55,7 +55,7 @@ def most_frequent_words() -> MaterializeResult:
     for raw_title in topstories["title"]:
         title = raw_title.lower()
         for word in title.split():
-            cleaned_word = word.strip(".,-!?:;()[]'\"-")
+            cleaned_word = word.strip(".,-!?:;()[]'\"-")  # noqa
             if cleaned_word not in stopwords and len(cleaned_word) > 0:
                 word_counts[cleaned_word] = word_counts.get(cleaned_word, 0) + 1
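The `# noqa` here presumably guards against B005: `str.strip` with a multi-character argument removes a set of characters, not a substring, which is a common misreading. In this code the character-set behavior is exactly what's wanted (shaving punctuation off both ends), so the rule is silenced. The gotcha in a sketch:

print("production".strip("prod"))  # 'uction' -- strips the char set {p, r, o, d}, not the prefix "prod"
print("hello!?".strip(".,-!?:;"))  # 'hello'  -- the intended punctuation-set use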

examples/dagster/dagster_code/tutorial/resources/__init__.py

+1 −1

@@ -93,7 +93,7 @@ def get_signups_for_date(self, date: datetime) -> Sequence[Signup]:
         signups = []
         num_signups = self.random.randint(25, 100)

-        for i in range(num_signups):
+        for _ in range(num_signups):
             signup = self.generate_signup(date)
             signups.append(signup.to_dict())

examples/dagster/hamilton_code/dataflow.py

+1 −1

@@ -31,7 +31,7 @@ def most_frequent_words(title: pd.Series) -> dict[str, int]:
     word_counts = {}
     for raw_title in title:
         for word in raw_title.lower().split():
-            word = word.strip(".,-!?:;()[]'\"-")
+            word = word.strip(".,-!?:;()[]'\"-")  # noqa
             if len(word) == 0:
                 continue

examples/dagster/hamilton_code/mock_api.py

+1 −1

@@ -94,7 +94,7 @@ def get_signups_for_date(self, date: datetime) -> Sequence[Signup]:
         signups = []
         num_signups = self.random.randint(25, 100)

-        for i in range(num_signups):
+        for _ in range(num_signups):
             signup = self.generate_signup(date)
             signups.append(signup.to_dict())

examples/decoupling_io/adapters.py

+2 −2

@@ -7,8 +7,8 @@
     import sklearn.inspection
     import sklearn.metrics
     import sklearn.model_selection
-except ImportError:
-    raise NotImplementedError("scikit-learn is not installed.")
+except ImportError as e:
+    raise NotImplementedError("scikit-learn is not installed.") from e


 from hamilton import registry

examples/dlt/slack/__init__.py

+16 −12

@@ -168,12 +168,7 @@ def get_thread_replies(messages: List[Dict[str, Any]]) -> Iterable[TDataItem]:
         write_disposition=write_disposition,
     )
     def messages_resource(
-        created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
-            "ts",
-            initial_value=start_dt,
-            end_value=end_dt,
-            allow_external_schedulers=True,
-        ),
+        created_at: dlt.sources.incremental[DateTime] = None,
     ) -> Iterable[TDataItem]:
         """
         Yield all messages for a set of selected channels as a DLT resource. Keep blocks column without normalization.

@@ -184,19 +179,21 @@ def messages_resource(
         Yields:
             Iterable[TDataItem]: A list of messages.
         """
+        if created_at is None:
+            created_at = dlt.sources.incremental(
+                "ts",
+                initial_value=start_dt,
+                end_value=end_dt,
+                allow_external_schedulers=True,
+            )
         start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
         end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
         for channel_data in fetched_selected_channels:
             yield from get_messages(channel_data, start_date_ts, end_date_ts)

     def per_table_messages_resource(
         channel_data: Dict[str, Any],
-        created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
-            "ts",
-            initial_value=start_dt,
-            end_value=end_dt,
-            allow_external_schedulers=True,
-        ),
+        created_at: dlt.sources.incremental[DateTime] = None,
     ) -> Iterable[TDataItem]:
         """Yield all messages for a given channel as a DLT resource. Keep blocks column without normalization.

@@ -207,6 +204,13 @@ def per_table_messages_resource(
         Yields:
             Iterable[TDataItem]: A list of messages.
         """
+        if created_at is None:
+            created_at = dlt.sources.incremental(
+                "ts",
+                initial_value=start_dt,
+                end_value=end_dt,
+                allow_external_schedulers=True,
+            )
         start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
         end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
         yield from get_messages(channel_data, start_date_ts, end_date_ts)

examples/due_date_probabilities/probability_estimation.py

+2 −3

@@ -125,10 +125,9 @@ def raw_probabilities(raw_data: str) -> pd.DataFrame:

 def resampled(raw_probabilities: pd.DataFrame) -> List[int]:
     sample_data = []
-    for index, row in raw_probabilities.iterrows():
+    for _idx, row in raw_probabilities.iterrows():
         count = row.probability * 1000
-        for i in range(int(count)):
-            sample_data.append(row.days)
+        sample_data.extend([row.days] * int(count))
     return sample_data
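The rewrite above folds the B007-flagged inner loop (its `i` was never used) into a single `list.extend` with sequence repetition. Equivalence in a sketch:

sample_data = []
sample_data.extend([7] * 3)  # [7, 7, 7]
# equivalent to:
# for i in range(3):         # B007: i is never used
#     sample_data.append(7)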

examples/people_data_labs/analysis.py

+1 −1

@@ -115,7 +115,7 @@ def stock_growth_rate_since_last_funding_round(
     df = pd.merge(left=stock_data, right=period_start, on="ticker", how="inner")

     stock_growth = dict()
-    for idx, row in df.iterrows():
+    for _, row in df.iterrows():
         history = pd.json_normalize(row["historical_price"]).astype({"date": "datetime64[ns]"})

         # skip ticker if history is empty

examples/prefect/run.py

+2 −2

@@ -72,15 +72,15 @@ def train_and_evaluate_model_task(
 )
 def absenteeism_prediction_flow(
     raw_data_location: str = "./data/Absenteeism_at_work.csv",
-    feature_set: list[str] = [
+    feature_set: list[str] = [  # noqa: B006
         "age_zero_mean_unit_variance",
         "has_children",
         "has_pet",
         "is_summer",
         "service_time",
     ],
     label: str = "absenteeism_time_in_hours",
-    validation_user_ids: list[str] = [
+    validation_user_ids: list[str] = [  # noqa: B006
         "1",
         "2",
         "4",

examples/spark/world_of_warcraft/zone_features__spark_v1.py

+3 −4

@@ -12,10 +12,9 @@ def world_of_warcraft(spark_session: ps.SparkSession) -> ps.DataFrame:

 def zone_flags(world_of_warcraft: ps.DataFrame) -> ps.DataFrame:
     zone_flags = world_of_warcraft
-    for zone in ["durotar", "darkshore"]:
-        zone_flags = zone_flags.withColumn(
-            "darkshore_flag", sf.when(sf.col("zone") == " Darkshore", 1).otherwise(0)
-        ).withColumn("durotar_flag", sf.when(sf.col("zone") == " Durotar", 1).otherwise(0))
+    zone_flags = zone_flags.withColumn(
+        "darkshore_flag", sf.when(sf.col("zone") == " Darkshore", 1).otherwise(0)
+    ).withColumn("durotar_flag", sf.when(sf.col("zone") == " Durotar", 1).otherwise(0))
     return zone_flags

hamilton/cli/__main__.py

+3 −3

@@ -127,7 +127,7 @@ def _try_command(cmd: Callable, **cmd_kwargs) -> Any:
             command=cmd_name, success=False, message={"error": str(type(e)), "details": str(e)}
         )
         logger.error(dataclasses.asdict(response))
-        raise typer.Exit(code=1)
+        raise typer.Exit(code=1) from e

     return result

@@ -297,12 +297,12 @@ def ui(
     """Runs the Hamilton UI on sqllite in port 8241"""
     try:
         from hamilton_ui import commands
-    except ImportError:
+    except ImportError as e:
         logger.error(
             "hamilton[ui] not installed -- you have to install this to run the UI. "
             'Run `pip install "sf-hamilton[ui]"` to install and get started with the UI!'
         )
-        raise typer.Exit(code=1)
+        raise typer.Exit(code=1) from e

     ctx.invoke(
         commands.run,

hamilton/cli/logic.py

+5 −5

@@ -27,8 +27,8 @@ def get_git_base_directory() -> str:
         else:
             print("Error:", result.stderr.strip())
             raise OSError(f"{result.stderr.strip()}")
-    except FileNotFoundError:
-        raise FileNotFoundError("Git command not found. Please make sure Git is installed.")
+    except FileNotFoundError as e:
+        raise FileNotFoundError("Git command not found. Please make sure Git is installed.") from e


 def get_git_reference(git_relative_path: Union[str, Path], git_reference: str) -> str:

@@ -51,8 +51,8 @@ def get_git_reference(git_relative_path: Union[str, Path], git_reference: str) -
             return
         else:
             return
-    except FileNotFoundError:
-        raise FileNotFoundError("Git command not found. Please make sure Git is installed.")
+    except FileNotFoundError as e:
+        raise FileNotFoundError("Git command not found. Please make sure Git is installed.") from e


 def version_hamilton_functions(module: ModuleType) -> Dict[str, str]:

@@ -184,7 +184,7 @@ def diff_versions(current_map: Dict[str, str], reference_map: Dict[str, str]) ->
         if v1 != v2:
             edit.append(node_name)

-    for node_name, v2 in reference_map.items():
+    for node_name, _ in reference_map.items():
         v1 = current_map.get(node_name)
         if v1 is None:
             reference_only.append(node_name)

hamilton/dataflows/__init__.py

+2 −2

@@ -498,10 +498,10 @@ def are_py_dependencies_satisfied(dataflow, user=None, version="latest"):
         else:
             package_name = line
             required_version = None
-        required_version  # here for now...
+        required_version  # noqa here for now...
         try:
             installed_version = pkg_version(package_name)
-            installed_version  # here for now..
+            installed_version  # noqa here for now..
         except PackageNotFoundError:
             logger.info(f"Package '{package_name}' is not installed.")
             return False
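The bare-name statements above are deliberate no-ops kept as placeholders; without the `# noqa`, Ruff's B018 would flag them as useless expressions, which are usually a forgotten assignment or call. A sketch of what the rule catches:

version = "1.2.3"
version                     # B018: computes a value and discards it
parts = version.split(".")  # assigning (or calling for effect) is what B018 expects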

hamilton/execution/executors.py

+1 −1

@@ -99,7 +99,7 @@ def base_execute_task(task: TaskImplementation) -> Dict[str, Any]:
     for node_ in task.nodes:
         if not getattr(node_, "callable_modified", False):
             node_._callable = _modify_callable(node_.node_role, node_.callable)
-            setattr(node_, "callable_modified", True)
+            node_.callable_modified = True
     if task.adapter.does_hook("pre_task_execute", is_async=False):
         task.adapter.call_all_lifecycle_hooks_sync(
             "pre_task_execute",
