Skip to content

Commit af3ff62

Browse files
bug fix for chunk_size and overlap cause error in dataprep ingestion (#1643)
* bug fix for dataingest url
  Signed-off-by: Mustafa <[email protected]>
* add validation function
  Signed-off-by: Mustafa <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* validation update
  Signed-off-by: Mustafa <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* update validation function
  Signed-off-by: Mustafa <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Signed-off-by: Mustafa <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c8b7e3c commit af3ff62

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

comps/dataprep/src/utils.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,11 +667,55 @@ def parse_html(input):
667667
return chucks
668668

669669

670+
def validate_and_convert_chunk_params(chunk_size, chunk_overlap):
    """Validate chunk_size and chunk_overlap and return them as integers.

    Each argument may be an ``int`` or a ``str`` that ``int()`` can parse.
    Strings are converted; integers are returned unchanged.

    Args:
        chunk_size: Requested chunk size; must end up as an integer >= 1.
        chunk_overlap: Requested overlap; must end up as an integer >= 0 and
            must not exceed ``chunk_size`` (equal values are accepted).

    Returns:
        A ``(chunk_size, chunk_overlap)`` tuple of integers.

    Raises:
        ValueError: If an argument is neither an int nor an int-like string
            (booleans are rejected as well), if ``chunk_size`` is below 1, if
            ``chunk_overlap`` is below 0, or if ``chunk_overlap`` is larger
            than ``chunk_size``.
    """

    def validate_param_instance(param, param_name):
        """Return ``param`` as an int, converting it from a string if needed.

        Raise a ValueError if the validation fails.
        """
        # bool is a subclass of int, so it has to be excluded explicitly;
        # otherwise True/False would slip through and act as 1/0.
        if isinstance(param, bool) or not isinstance(param, (int, str)):
            raise ValueError(f"{param_name} must be an integer or a string representing an integer.")

        if not isinstance(param, str):
            return param

        try:
            return int(param)
        except ValueError as err:
            # Chain the original parsing error so the root cause stays visible in tracebacks.
            raise ValueError(
                f"{param_name} must be an integer or a string that can be converted to an integer."
            ) from err

    def validate_param_value(param, param_name, min_value):
        """Raise a ValueError if ``param`` is smaller than ``min_value``."""
        if param < min_value:
            raise ValueError(f"{param_name} must be a {min_value} or greater.")

    # Type validation first: range checks below require real integers.
    chunk_size = validate_param_instance(chunk_size, "chunk_size")
    chunk_overlap = validate_param_instance(chunk_overlap, "chunk_overlap")

    # Range validation: size must be positive, overlap must be non-negative.
    validate_param_value(chunk_size, "chunk_size", 1)
    validate_param_value(chunk_overlap, "chunk_overlap", 0)

    # Ensure chunk_overlap is not larger than chunk_size
    if chunk_overlap > chunk_size:
        raise ValueError("chunk_overlap cannot be larger than chunk_size.")

    return chunk_size, chunk_overlap
710+
711+
670712
def load_html_content(links, chunk_size=1500, chunk_overlap=50):
671713
from langchain.text_splitter import RecursiveCharacterTextSplitter
672714
from langchain_community.document_loaders import AsyncHtmlLoader
673715
from langchain_community.document_transformers import Html2TextTransformer
674716

717+
chunk_size, chunk_overlap = validate_and_convert_chunk_params(chunk_size, chunk_overlap)
718+
675719
loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
676720
docs = loader.load()
677721
html2text = Html2TextTransformer()

tests/dataprep/dataprep_utils.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ function _invoke_curl() {
3939
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
4040
}
4141

42-
#
42+
4343
function _add_db_params() {
4444
local db=$1
4545
if [[ "$db" == "redis" ]]; then
@@ -102,6 +102,14 @@ function ingest_external_link() {
102102
_invoke_curl $fqdn $port ingest -F 'link_list=["https://www.ces.tech/"]' $extra_args $@
103103
}
104104

105+
# Ingest an external link into the dataprep service with explicit chunking
# parameters (chunk_size=1500, chunk_overlap=100) and a caller-chosen index.
#
# Arguments:
#   $1 - host name or IP address of the dataprep service
#   $2 - port of the dataprep service
#   $3 - index name, sent as the "index_name" form field
#   $@ - (after the first three) extra arguments forwarded to _invoke_curl
function ingest_external_link_with_chunk_parameters() {
    local fqdn=$1
    local port=$2
    local index_name=$3
    shift 3
    # Quote every expansion so values containing spaces or glob characters
    # reach _invoke_curl as single, unmodified arguments.
    _invoke_curl "$fqdn" "$port" ingest -F 'link_list=["https://www.ces.tech/"]' -F "chunk_size=1500" -F "chunk_overlap=100" -F "index_name=${index_name}" "$@"
}
112+
105113
function delete_all() {
106114
local fqdn=$1
107115
local port=$2

tests/dataprep/test_dataprep_redis.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ function validate_microservice() {
7777
ingest_external_link ${ip_address} ${DATAPREP_PORT}
7878
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
7979

80+
ingest_external_link_with_chunk_parameters ${ip_address} ${DATAPREP_PORT} "rag_redis_test_link_params"
81+
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
82+
8083
ingest_txt_with_index_name ${ip_address} ${DATAPREP_PORT} rag_redis_test
8184
check_result "dataprep - upload with index - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
8285

0 commit comments

Comments
 (0)