From 1e2ad87d86c24cbc58bc454ec42fc21595d23f17 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Wed, 1 May 2024 11:01:26 -0400 Subject: [PATCH 1/7] Add Python wrapper and sample notebooks. --- README.md | 6 + python/.idea/.gitignore | 3 + .../inspectionProfiles/profiles_settings.xml | 6 + python/.idea/misc.xml | 7 + python/.idea/modules.xml | 8 + python/.idea/vcs.xml | 6 + python/README.md | 47 ++++ ...aframe_rules_engine-0.0.1-py3-none-any.whl | Bin 0 -> 5141 bytes .../dist/dataframe_rules_engine-0.0.1.tar.gz | Bin 0 -> 4420 bytes .../01-Generate Mock Purchase Transactions.py | 60 +++++ .../02-Apply Purchase Transaction Rules.py | 152 +++++++++++++ python/examples/PythonWrapper.py | 212 ++++++++++++++++++ python/pyproject.toml | 31 +++ python/src/databricks/__init__.py | 3 + python/src/databricks/labs/__init__.py | 0 .../databricks/labs/validation/__init__.py | 0 .../labs/validation/local_spark_singleton.py | 8 + python/src/databricks/labs/validation/rule.py | 91 ++++++++ .../databricks/labs/validation/rule_set.py | 32 +++ .../databricks/labs/validation/rule_type.py | 9 + .../databricks/labs/validation/structures.py | 51 +++++ .../labs/validation/utils/__init__.py | 0 .../labs/validation/utils/helpers.py | 12 + python/tests/local_spark_singleton.py | 34 +++ python/tests/test_rule.py | 30 +++ python/tests/test_rule_set.py | 145 ++++++++++++ python/tests/test_structures.py | 39 ++++ 27 files changed, 992 insertions(+) create mode 100644 python/.idea/.gitignore create mode 100644 python/.idea/inspectionProfiles/profiles_settings.xml create mode 100644 python/.idea/misc.xml create mode 100644 python/.idea/modules.xml create mode 100644 python/.idea/vcs.xml create mode 100644 python/README.md create mode 100644 python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl create mode 100644 python/dist/dataframe_rules_engine-0.0.1.tar.gz create mode 100644 python/examples/01-Generate Mock Purchase Transactions.py create mode 100644 python/examples/02-Apply Purchase Transaction Rules.py create mode 100644 python/examples/PythonWrapper.py create mode 100644 python/pyproject.toml create mode 100644 python/src/databricks/__init__.py create mode 100644 python/src/databricks/labs/__init__.py create mode 100644 python/src/databricks/labs/validation/__init__.py create mode 100644 python/src/databricks/labs/validation/local_spark_singleton.py create mode 100644 python/src/databricks/labs/validation/rule.py create mode 100644 python/src/databricks/labs/validation/rule_set.py create mode 100644 python/src/databricks/labs/validation/rule_type.py create mode 100644 python/src/databricks/labs/validation/structures.py create mode 100644 python/src/databricks/labs/validation/utils/__init__.py create mode 100644 python/src/databricks/labs/validation/utils/helpers.py create mode 100644 python/tests/local_spark_singleton.py create mode 100644 python/tests/test_rule.py create mode 100644 python/tests/test_rule_set.py create mode 100644 python/tests/test_structures.py diff --git a/README.md b/README.md index 9a04923..dc5e8d1 100644 --- a/README.md +++ b/README.md @@ -328,6 +328,12 @@ evaluation specs and results The summary report is meant to be just that, a summary of the failed rules. This will return only the records that failed and only the rules that failed for that record; thus, if the `summaryReport.isEmpty` then all rules passed. + +## Python Wrapper +The Python Wrapper allows users to validate data quality of their PySpark DataFrames using Python.
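+
+A minimal usage sketch (the toy DataFrame and rule below are illustrative only; this assumes the wrapper's `Rule` and `RuleSet` classes from `/python` are importable and that the Scala rules-engine JAR is available to the Spark session):
+
+```python
+import pyspark.sql.functions as F
+from pyspark.sql import SparkSession
+
+from databricks.labs.validation.rule import Rule
+from databricks.labs.validation.rule_set import RuleSet
+
+spark = SparkSession.builder.getOrCreate()
+
+# Toy input; in practice, pass the DataFrame you want to validate
+df = spark.createDataFrame([(1001,), (None,)], "acct_no long")
+
+# Boolean rule: the column expression must evaluate to True for a record to pass
+rule_set = RuleSet(df)
+rule_set.add(Rule("valid_acct_no_rule", F.col("acct_no").isNotNull()))
+
+# Run the rules and inspect the records that failed
+results = rule_set.validate()
+results.get_summary_report().show()
+```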
+ +The Python Wrapper can be found under the directory `/python`. A quickstart notebook is also located under `/python/examples`. + ## Next Steps Clearly, this is just a start. This is a small package and, as such, a GREAT place to start if you've never contributed to a project before. Please feel free to fork the repo and/or submit PRs. We'd love to see what diff --git a/python/.idea/.gitignore b/python/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/python/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/python/.idea/inspectionProfiles/profiles_settings.xml b/python/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/python/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/python/.idea/misc.xml b/python/.idea/misc.xml new file mode 100644 index 0000000..812ab5a --- /dev/null +++ b/python/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/python/.idea/modules.xml b/python/.idea/modules.xml new file mode 100644 index 0000000..614b3c1 --- /dev/null +++ b/python/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/python/.idea/vcs.xml b/python/.idea/vcs.xml new file mode 100644 index 0000000..6c0b863 --- /dev/null +++ b/python/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..ba84402 --- /dev/null +++ b/python/README.md @@ -0,0 +1,47 @@ +# Python Connector for the DataFrame Rules Engine +The Python Connector allows users to validate data quality of their PySpark DataFrames using Python. + +```python +myRuleSet = RuleSet(df) +myRuleSet.add(myRules) +validation_results = myRuleSet.validate() +``` + +Currently, the Python Connector supports the following Rule types: +1. List of Values (Strings _only_) +2. Boolean Check +3. User-defined Functions (must evaluate to a Boolean) + + +### Boolean Check +Validate that a column expression evaluates to True. +```python +# Ensure that the temperature is a valid reading +valid_temp_rule = Rule("valid_temperature", F.col("temperature") > -100.0) +``` + +### List of Values (LOVs) +Validate that a Column only contains values present in a List of Strings. + +```python +# Create a List of Strings (LOS) +building_sites = ["SiteA", "SiteB", "SiteC"] + +# Build a Rule that validates that a column only contains values from LOS +building_name_rule = Rule("Building_LOV_Rule", + column=F.col("site_name"), + valid_strings=building_sites) +``` + +### User-Defined Functions (UDFs) +UDFs are great when you need to add custom business logic for validating dataset quality. +User-Defined Functions used with the DataFrame Rules Engine must return a Boolean column expression.
+ +```python +# Create a UDF to validate date entry +def valid_date_udf(ts_column): + return ts_column.isNotNull() & F.year(ts_column).isNotNull() & F.month(ts_column).isNotNull() + +# Create a Rule that uses the UDF to validate data +valid_date_rule = Rule("valid_date_reading", valid_date_udf(F.col("reading_date"))) +``` diff --git a/python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl b/python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..f33a54a04444d299c7373b5daf51a2249840d179 GIT binary patch literal 5141 zcma)=2UOEb7KcN7bVLXsMVd52uS!$ty%<6lV~_};B$NaN6r^`hI!GsgAWfQ7A0S;y zsDkt&pn?Si1W`7;eXg3_yeNG0RR96-c*e*Gdb();d|kD zW5$~u3=Ok&M_hOE5Qjn$NCX-R6?65{6Yd=}#XZ;22aOBs_4VlMi1hRgjtGPLbxgba zM7za7mnn!VF)eo5a21d_n<4GTjw1fGwM3b z8S3E*b9aJzAdvRXa5M^u-yG9uNxDD@8jYJ-ZD&N}0Ve!(IDZld!q#E5UB^ZG(@t<$ zKI*m+)GyJh;!cg1m*Q7D{d`l&D~6BnITq6Zcznk61ZJfyUr_>1Ae)&g&7*D9t=!J| z@R@CKI8a!26%{h_Oa&j(O?w#0$JJf;Ot$Q1r#*8(uo-a?)w65(jSQjz0CBpP)2dcxGw{Wl-arNMw2tc6yME_^3jR~HxCzRs!+kXF&CuLLnPZUJ0%GC7f44DMZ5 z!xV>-DAnAH!mO2;FqCy?Hd@%bb<(WbWNaMf_j(P;V@EV`3=9FG834Pu7x~%^QJwqZ zv8d^3(pxTLIT8MbR{7mb^~uBn75yO#`Cyaf!bR7r6u&E-FJGUF`Ia8BhV^3aqT*q~ zvW`9Mw9<{iDfVvjnXRj&R$~RGLgwKNUUf{$$SYChYEzwLr}SgwAa)fNF}9V4!7GDn zF6Gj^`>hmX!ch->CBhxIKec!&g`XCZUufX$M@I~-il~v$F}>NYSMOps=4ZUr$9zRY z=IK=~iek9%T=TWuV%nVImc*dfEVf>C&^DQq!*w)_Ai1DG3(_$DJ^c&(DylcjWgLWM z<)mhYlnsi&pXfJBLV7x~oLODv#BaX4WX^+rBEF=b^zr@%h26XYit~lq*0jRpdvqn4Jn7-T%;k6Y}RtfCKp|q2zxEAAvrC7_2~vn z&-9p${?=I*WSiolP`DNu=GvEcuWR!BevLPUEPse<5di>CNdSO<2pUKC1nL1to_A@2^%yxHxZL4-QW$9z^?9%_)Y;$daY9nOa$n!O_1vZ^^zD$7 z2N|I#wVKlzv7bnIZ+lm0hozmB`U-7&_uikqj`SNKkFj8;CHie^SceL!@)cUmbHJ=E znk?%5Lh~@+H%rTsz`*kD%^;YRB)zIP2@h_>?nBgl%=;i5nhyjD;zR=j$YkDJ%GXVn z$zom9d+=%EQ%Cu!cq#Q$-gYyn@OvOitRv1kB)NP?Y&VIb#xZ~^X?I|gg```RZpk6j zxpZ_aj2mva%J`)JNz#fm>c)q)c3RGshUlJj^WqNsrFqF{Ukej+@5i?~#DgD!y4uV< zr#7nliah^xoO>fTyw+JQ2INdGbSo)O*iiP$$zqm!OZULW*-Xij`rDq-V4tsx<%`rS zdQUT0oEACK%o;rVe5^M~emx4~&;lG1AF5{XCiu@G3ySt~J&IdPx&t~QVrCTh+(TO= zqFuAN2nGaHw5Rtaw8ueSLn{3_*Y#IlSrxf)=fCK zWD20Tay~A%uL&Il65=wwPfZ_cw^Gn%+DdAiD8IVOyo@!w3*lEw(Y-u8KS+ylSn8i)aEiJQLpi@B>lHiGL~QF3iDq-aL%%tpSIWzfB_ zq!Z#KllyZwX=Eka4a9B4vd>AQ=4qwDyW?KNCveicD z6nf_Ff@;aU?R0qsN^^SM3mUYCi8O^5N>hd8&NQXs3J`U+faT6Vssd{f6lEC`@n!Lv z`(@Vt#$SYU*Do%VX|MTA=bAju-D`Day2RPBt!%>0JZ*dHRV}h=E+s+}OONr`y%2}N)nWb_$wa`3d^bgmssf}FCPQP{IcK?SPo%O} zE;t+qZs-~ZW-u;ch4~ww1M{ z>G{RO3CwUnVV_%gK|??JuYQd8ga1evk>i~LgGM+Xt7#G>K1}uE8CLjIF8n(z2e`8< z-2KSM`tM0n2VSBHzUil-8SIW`raC=Uq^5O}l5HdL?b-T=pWpjf4e93KN>gK|lXtJi zv7_k>?bvKt$ca8HD5wnW1umPhm=tC+UnDVyKn+&1>CaJh={M zAmcPI1&MdefXE(~-cnTV2nt~uT9B_-FIGM>mKv1@HFRYfuibx6-L&zfLVvjOj#{AL zuiLwF>&3-n{GFr3M{M!Gx8#O9%mofT3eX-Ng zL`7W%qLKk_MNUc3G`|(5SZABs@Jz!oQKyv&CzWap*VuR5aF(sa-rtlu{3^uqI*mxN zghdU@#nscb%M_2$m+DYvn3|Vu5;Yn-H`C!hKq0!TA3T!W17jZA3N%@tB{_xwd<)Zz=iok#7FjRNb8n?77F*Y!tydYvuU@r+eEs9s zGp197PMhM-)*Nr&uh4ItvAMQ}hCcP6wn6_8t)3`opl`4WEHNrQ?{3tuC#oybBVGya z))vtj=XVcY`M z6NLT5sYwe3{Ef^c1pq{U-vz-MYDQr7n&Ibhxe~y;3wJJ3Ph1J=wq?#%QMq&y=oU{! 
zpP=ZDXf5QjFKFm2PJQ#vpd6gou`5_;7^KJ{qmxooPQec!=8-AVDsH}JNbmQVjYLgN z*vkWLi*U=|Nb|TPxHmpCK7yup$lz&y38>Gt;A~R&tKlF&qUc`a$RnSKJ@VVnyf*9fiZ`t{nyB1jUb3BIb&TIkk1qpdslxk6 zMT|%LOM;vm>scd|KvAm(j{-4Frv&4w-?~MJE__Rgzh4j?PCMo zm$q!b{gBrBt6Pjy^Q2``c*zKD;yV);^-&bQ`9D|=p za$gLOGP*=8O~ih#b=t3Z%RYQzkBQ}ZFgvde>C0*MrYd?JxXFV^0h<^)u>ee^?}Ndf zv$G~86c^GB=lJ(_k!x9P!OOCUOutsnY_NGM?H3`NXV--@A+plb%4Iq+12}OY zYf*jb3#@lG1`*EK!!=@pnX}m5$E5%P^N(0jWH}#ag*=~<&|Yl}vnR^iYaMN~&gE_I zwu=}D@q<}FYmhyO{SPOT85PgZtuq6+HyMrfh=_qC|G!knd*WXW>wom|Q|0_q;ZDdn z`X%0z|7u6|_V33bp9a?J0Lf2bltFrk|G511&2>fcNL zjlA<+TM)tt)7TGqF#d0uqoex`_+LqlkW83?evtWSf1CX8d}ORghW8@?fENEU!+UAb IiKB1-0?JYHyZ`_I literal 0 HcmV?d00001 diff --git a/python/dist/dataframe_rules_engine-0.0.1.tar.gz b/python/dist/dataframe_rules_engine-0.0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..51ab2f1f48a57b167dede7f18dba52cbfa55f94c GIT binary patch literal 4420 zcmV-K5xedmiwFP!5jI`||Lr~7a@)9)dFEGOnW|ILluSvMY}cwrsqxs!oa{-QRGgW8 z@NubV5|Wruq=o=(Yq$2_w{amzfRse}q9ilyhnNBz-9V#nxG}y<-D%=3@FYn?OeQ#* z1ra{zIX$P}C5hMd0pTVIysxA?nFLWlClhD6da(=i0FH-4{ujXf+dJ&{U-SpZ{i9*O zH)Q39y?%fA0`*?(0a8NU1e)^V?+Mc+ULYrQZ-{g2h5?Lr5JwKv1r#inaYE52c<%6j zKd+Y9T3=y8FfCTP#B>cdp-Jk|G{Gdp|H*pMz_$Q@2m5Ek_z)+s&HmZ5ZvXw>px@a4 zcRBvK8P`^Y-N%2>KOQvW|GkfY7<+CwAxk&;Iw3(c3o!-fURH*!@qaKh#{U=+fM)#n z7XOzkK8OzaGt2_=B`dhZgup;smgR*mA+j(V>5*k2fM58#1*Y@Z=LJ5Vq8X;(96+>@ zhg)7q+9UpoL4qOjM;Zg?CJjO#Ck|BmkenshrMT5@QiAW?{#lJWX}}iyKR)c4_J26+ zoiz5p50K_AJ%Gs9>1qi!o_T_I0il+7k?TS!rLT+yx--d2x60E^!da2 zhY@-c+#}wjc@R-FjT6L~Poi{zp+!VF&Mc>D5gblC%4sU#vWdW6M+iDwf#Ahqx`;+a zpYELTvI^uICLK2kFd0cnp)tCMBV2(i8jSCkNfq&J^`NmZ{=kN*(-r)>-c}{+?H{S0Ta3-v>0CNYU^k*t!>otGAW1$ z<83Zyz{r0XSh<7&EB{f6CFQ9QW#Dk z{Zf*Fv{Z-2lP~VAJ8_f5UFmc>#leL?UR$FnUVh;n8tbTEZWoRhFPshTgH?0fS#T5?kbCTj+`}UeSX%)di--JE# z5MG}66uJqzDNF*wiK?H6hm8aZQAAbM$7o}9BM=xQq(iAeffP+TC&1!XjoMg~oH-md z1eoFf3h3C43bb;kRfi16K6A5J$0-y%T^$k}{XN6ZX(&ysL4e_20IDVrcUDJ@0&+T= z6d2eeIB{~+dn=zHrIa-PY#>; z|F>NKJBML^lWTfh$j+kze#Y+~quEZs$*-f-(~I%>K;&Db^0dOGJ_(9C5oqg!R{-ZuHI2 zsoi}!+w;{^0k)KDRQpmIQovnWmGWf8yA!cBe2F zg&|HC3pZKSq$#mClgx(`sAl~u(P|mp=@?CEn_@1LwY~%Y&w~Dz0I+TUXD~Q2{r_;- ztpDx>3ci%hX{0MH&TK$;ZI-9o+8ec6X!D5((cfO=({P(BOje+yU8)5ffhB+INMMin zbFjb)D=W~^&bT$*Gb~|P_jZ$5KCk`f!Fsa;*kb>K!~UUZ|Hns-{r_e6f3gHl_>lG4 zTNvNrq=VAs5?AeX@Yio6FHA{ri^uC${&{r+&CHrODR3S9ul|ZxhgbY>MJ+`+tF8(v z=oFFlf+yI5_@aFBu4`vv_vig)+m3ZhN;uZXRFy+TV)a$F>b(`(z z2N5+k>=`M!p-McFB(X9`jiQvucW)MfSPS_sB~a*}8Q62w zCZ7NJ{-0t(N$q3)58nSf8JhmTf7qP={kHpm+WG$?l7?^=M^j4{#ANL~0eK?!6y+;J z0`YDhcysAH2x2F`E(H&{#*rPPov2k0IbnR*)dL_WE*BuzVB{#Ak!Z~bof;@bDupen zXd2A2g&HP~R@)(%o`zxTs$~nY`blsL;Q`;%#64pz{^-)VU3Y;aCJv5ngCvfw>`%YH zdwn^1eev7di`SF?I{jo{x35e4M6If;6(welXnKqN8H?w38OdEP*>RATt>&bBtAL=_ z+)vmQRkS-C?JWZ(e9M~Q`l}0f;S!3Imd%B5_0zTeECcXp|IhxN)V=GmyZ`U^Pfm>c z-~D5T-uVCht^cPmg=gbz>H7gJB2*MP(=_tf6p$d7pf_@N0A`EMmRlXg3mvs9ZMX`; z74$)S51*kXqt(iK?ywSe0q~9$a5ofgJo2r0T{UtjxtHGN5hrFVpHL?{_BXO3C^hU`gZ}H z!?Qg3>4xathd(l+9(NkPBEa1nYjTu12zmUqW&fJJg6b1iWM_wM$8Xr$EV$=@igjDN zUB?@1$4SRm_9aw51)}`n*X-d8obuWZh9Pn(5V%P}+{lG;_C*3E96s%lbG3X)7?c5n znv}bmFG9E4Xgo%hUAFmYWxi4^&#g0-U=;X%2~4r?*fltn4{QpKxd?&NTYjZNE_*+g zV}7cWb6R&&AJ~4LW1%?03Tv`tNL3v*Px!Wr9LPLQLtoToXa7-#%kS%8C}8IhWa!<2 zB|AjBnGs+lcqUwqZCI_=U}(e^+uhjejPtGr?8?kJ8~%7IZ{0`^8=XQSpNc~!99B!W z=C@fN&m@XeZ)wWKigx?K25630sXZl=Ti31!m|dCKI;+!q`zXU()4>gP#}3>)*yt*{ z_{{cJ$?-wvRu!)*>_;kkNr=`?HMS@0CIQK9bm96CpfJ6| zILg_w8+|$P8lefqk}P7jBT*|j)krn-t4HHjtJ=;Fs_h)T8lw}Z_XGyNg!GEGa;q)O zM4T7bg%g>vr`x&k({ItLgQuraiv|>f9u@!d%*tR z;n2MQ(LXtAzW@1c)_-xShDwARmgFRu_mv>*T 
zG{@QM_j-LW{r`z0JaNIV9)TqqX)VJ!%y4c1=kUlGR^SXW91n2#Ia`I2pWNWr7z_SE z%&GHRaK+ZP9U7m%Y1ux$4Lm#vd^_VH9$B7@6PKM~E@ZT7uaPds_c9-T^-)n>8=tce*syz=knYH73)JtJr7<36R#joekw<+#sX+*xmpRmE`Cvb>*{QflRT@7A5GnW%^$g% zSKWWi>J@~gZg#eviu1zl%4|R*=G?(F24|U=Xn`;^Lotdn?>{0&{>{(-WP4Y&?s`2W z{vREml)nGnod5lfeQytEyP{$O7 z(2v6R5_7REf@rdE?UZCp)K6bm-PfoB1t(QDZBM}c-l8^5xzx9oHC}-hS#B+&1&rH!94o;fnE zj@O~_B783U?;rO~`#@JQRayFP1u4R4zH~x)OKf*xPG@7I9OLw*{&`5moZ!JtDra|#WbXIGBS zGhV36pV@kKVada`+!Xv|k}cT(FQ0de?Q=H@aELy~3qD)Kkc)(4XgDinR)fheg@qJI zPi*vmokIQp4#E)q79`Nvy!ss!Lhj89t7>0cU-9ZLPJAIr-u`S4m`_>rFIw1aNL^ou zQ-yO|jq`!XdDz=m&tk@;jLEMTEcqK<7B=LT#7^V#?N&<;x8B17F)_WG}fQ z^GcWKZItzf+DvRp#hf$=$+h)Q7@s9;1Jh?pmt@rK!jxy8-Z*+9$@9o{+W3}`l*{VD zG`ayN@hv`Bw$cB8{MT;>Z!g|_cqET){(orx|BwCQVXwLWzZZB9_QZ7H;1A(bN9fS$ zTNnJ}K-J1>DNL>;)}`2$8KI|ly4FWwt_Qz=dN)FuwRX@$?M$a9?DpZ*q}5udJTM$^ z-#tRbfPjA95C~p%P{H}5pVvG6SJuCzzaI&)cDVF9xtq7nlv5s|I*wDTu-$uV=$pOx z4ye-MTc5IEB#K9fhoZdrJRtcOlC1MQ)E}I&C5nh1@K0AFwp!5zUHt0Dcadc^Q=0}F zXrO@x8fc(_1{!Fffd(3Apn(P&XrO@x8fc(_1{!Fffd(3Apn(P&XrO@x8ff4z3I7jV K1Nxo-pa1|%Ub|KR literal 0 HcmV?d00001 diff --git a/python/examples/01-Generate Mock Purchase Transactions.py b/python/examples/01-Generate Mock Purchase Transactions.py new file mode 100644 index 0000000..76797d8 --- /dev/null +++ b/python/examples/01-Generate Mock Purchase Transactions.py @@ -0,0 +1,60 @@ +# Databricks notebook source +catalog_name = "REPLACE_ME" +schema_name = "REPLACE_ME" + +# COMMAND ---------- + +import random +import datetime + +def generate_sample_data(): + """Generates mock transaction data that randomly adds bad data""" + + # randomly generate bad data + if bool(random.getrandbits(1)): + appl_id = None + acct_no = None + event_ts = None + cstone_last_updatetm = None + else: + appl_id = random.randint(1000000, 9999999) + acct_no = random.randint(1000000000000000, 9999999999999999) + event_ts = datetime.datetime.now() + cstone_last_updatetm = datetime.datetime.now() + + # randomly generate an MCC description + categories = ["dining", "transportation", "merchendise", "hotels", "airfare", "grocery stores/supermarkets/bakeries"] + random_index = random.randint(0, len(categories)-1) + category = categories[random_index] + + # randomly generate a transaction price + price = round(random.uniform(1.99, 9999.99), 2) + + data = [ + (appl_id, acct_no, event_ts, category, price, cstone_last_updatetm) + ] + df = spark.createDataFrame(data, + "appl_id int, acct_no long, event_ts timestamp, category string, price float, cstone_last_updatetm timestamp") + return df + +# COMMAND ---------- + +spark.sql(f"create schema if not exists {catalog_name}.{schema_name}") + +# COMMAND ---------- + +spark.sql(f""" +CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.purchase_transactions_bronze +(appl_id int, acct_no long, event_ts timestamp, category string, price float, cstone_last_updatetm timestamp) +USING DELTA +TBLPROPERTIES (delta.enableChangeDataFeed = true) +""") + +# COMMAND ---------- + +df = generate_sample_data() +df.write.insertInto(f"{catalog_name}.{schema_name}.purchase_transactions_bronze") + +# COMMAND ---------- + + diff --git a/python/examples/02-Apply Purchase Transaction Rules.py b/python/examples/02-Apply Purchase Transaction Rules.py new file mode 100644 index 0000000..a101794 --- /dev/null +++ b/python/examples/02-Apply Purchase Transaction Rules.py @@ -0,0 +1,152 @@ +# Databricks notebook source +# MAGIC %run ./PythonWrapper + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Ingest new Data + +# COMMAND ---------- + +import datetime + 
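+# `starting_time` is the watermark for the Change Data Feed read in the next cell:
+# only rows inserted into the bronze table during the last five minutes are pulled in
+# (read with `readChangeFeed` and kept via the `_change_type = 'insert'` filter).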
+starting_time = datetime.datetime.now() - datetime.timedelta(minutes=5) + +catalog_name = "REPLACE_ME" +schema_name = "REPLACE_ME" + +# COMMAND ---------- + +# Read table changes from 5 mins ago +df = spark.read.format("delta") \ + .option("readChangeFeed", "true") \ + .option("startingTimestamp", starting_time) \ + .table(f"{catalog_name}.{schema_name}.purchase_transactions_bronze") +purchase_transactions_df = df.select("appl_id", "acct_no", "event_ts", "category", "price", "cstone_last_updatetm")\ + .where("_change_type='insert'") +purchase_transactions_df.display() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Define Rules using Builder Pattern + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Sample Rules +# MAGIC +# MAGIC From a DQ rule point of view, we would be looking at the following scenarios: +# MAGIC +# MAGIC - **event_ts**: Should have a timestamp for every day (timestamp format doesn’t matter) +# MAGIC - **cstone_last_updatetm**: Should have a timestamp for every day +# MAGIC - **acct_no**: No null values for this column +# MAGIC - **appl_id**: No null values for this column +# MAGIC - **Changes in string length** - for all columns +# MAGIC + +# COMMAND ---------- + +import pyspark.sql.functions as F + +# First, begin by defining your RuleSet by passing in your input DataFrame +myRuleSet = RuleSet(purchase_transactions_df) + +# Rule 1 - define a Rule that validates that the `acct_no` is never null +acct_num_rule = Rule("valid_acct_no_rule", F.col("acct_no").isNotNull()) +myRuleSet.add(acct_num_rule) + +# Rule 2 - add a Rule that validates that the `appl_id` is never null +appl_id_rule = Rule("valid_appl_id", F.col("appl_id").isNotNull()) +myRuleSet.add(appl_id_rule) + +# COMMAND ---------- + +# Rules can even be used in conjunction with User-Defined Functions +def valid_timestamp(ts_column): + return ts_column.isNotNull() & F.year(ts_column).isNotNull() & F.month(ts_column).isNotNull() + +# COMMAND ---------- + +# Rule 3 - enforce a valid `event_ts` timestamp +valid_event_ts_rule = Rule("valid_event_ts_rule", valid_timestamp(F.col("event_ts"))) +myRuleSet.add(valid_event_ts_rule) + +# Rule 4 - enforce a valid `cstone_last_updatetm` timestamp +valid_cstone_last_updatetm_rule = Rule("valid_cstone_last_updatetm_rule", valid_timestamp(F.col("cstone_last_updatetm"))) +myRuleSet.add(valid_cstone_last_updatetm_rule) + +# COMMAND ---------- + +# Rule 5 - compute the average `category` string length to validate against +import pyspark.sql.functions as F +import datetime + +starting_timestamp = datetime.datetime.now() - datetime.timedelta(minutes=5) +ending_timestamp = datetime.datetime.now() - datetime.timedelta(minutes=1) + +# Read the inserted rows from table versions 0 through 10 to compute the average `category` length +df = spark.read.format("delta") \ + .option("readChangeFeed", "true") \ + .option("startingVersion", 0) \ + .option("endingVersion", 10) \ + .table(f"{catalog_name}.{schema_name}.purchase_transactions_bronze") +df_category = df.select("category").where("_change_type='insert'").agg(F.mean(F.length(F.col("category"))).alias("avg_category_len")) +avg_category_len = df_category.collect()[0]['avg_category_len'] +print(avg_category_len) + +# COMMAND ---------- + +def valid_category_len(category_column, avg_category_str_len): + return F.length(category_column) <= avg_category_str_len + +# Rule 5 - validate `category` string lengths +valid_str_length_rule = Rule("valid_category_str_length_rule", valid_category_len(F.col("category"), avg_category_len)) +myRuleSet.add(valid_str_length_rule) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Validate Rows + +# COMMAND 
---------- + +from pyspark.sql import DataFrame + +# Finally, add the Rule to the RuleSet and validate! +summaryReport = myRuleSet.get_summary_report() +completeReport = myRuleSet.get_complete_report() + +# Display the summary validation report +display(summaryReport) + +# COMMAND ---------- + +# Display the complete validation report +display(completeReport) + +# COMMAND ---------- + +spark.sql(f""" + CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.purchase_transactions_validated + (appl_id int, acct_no long, event_ts timestamp, category string, price float, cstone_last_updatetm timestamp, failed_rules array) + USING DELTA + TBLPROPERTIES (delta.enableChangeDataFeed = true) +""") + +# COMMAND ---------- + +import pyspark.sql.functions as F +import pyspark.sql.types as T + +if summaryReport.count() > 0: + summaryReport.write.insertInto(f"{catalog_name}.{schema_name}.purchase_transactions_validated") +else: + string_array_type = T.ArrayType(T.StringType()) + purchase_transactions_df \ + .withColumn("failed_rules", F.array(F.array().cast(string_array_type))) \ + .write.insertInto(f"{catalog_name}.{schema_name}.purchase_transactions_validated") + +# COMMAND ---------- + + diff --git a/python/examples/PythonWrapper.py b/python/examples/PythonWrapper.py new file mode 100644 index 0000000..1bb96e1 --- /dev/null +++ b/python/examples/PythonWrapper.py @@ -0,0 +1,212 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Spark Singleton + +# COMMAND ---------- + +import pyspark +from pyspark.sql import SparkSession, DataFrame +from typing import List + + +class SparkSingleton: + """A singleton class which returns one Spark instance""" + __instance = None + + @classmethod + def get_instance(cls): + """Create a Spark instance. + :return: A Spark instance + """ + return (SparkSession.builder + .appName("DataFrame Rules Engine") + .getOrCreate()) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Rule Types + +# COMMAND ---------- + +class RuleType: + + ValidateExpr = "expr" + ValidateBounds = "bounds" + ValidateNumerics = "validNumerics" + ValidateStrings = "validStrings" + ValidateDateTime = "validDateTime" + ValidateComplex = "complex" + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Structures class + +# COMMAND ---------- + +class Bounds: + + def __init__(self, lower, upper, + lowerInclusive = False, + upperInclusive = False): + self.lower = lower + self.upper = upper + self.lowerInclusive = lowerInclusive + self.upperInclusive = upperInclusive + self._spark = SparkSingleton.get_instance() + self._jBounds = self._spark._jvm.com.databricks.labs.validation.utils.Structures.Bounds(lower, upper, lowerInclusive, upperInclusive) + + def validationLogic(self, col): + jCol = col._jc + return self._spark._jvm.com.databricks.labs.validation.utils.Structures.Bounds.validationLogic(jCol) + + +class MinMaxRuleDef: + + def __init__(self, + rule_name: str, + column: pyspark.sql.Column, + bounds: Bounds, + by: List[pyspark.sql.Column] = None): + self.rule_name = rule_name + self.column = column + self.bounds = bounds + self.by = by + + +class ValidationResults: + + def __init__(self, + complete_report: pyspark.sql.DataFrame, + summary_report: pyspark.sql.DataFrame): + self.complete_report = complete_report + self.summary_report = summary_report + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Rule class + +# COMMAND ---------- + +class Rule: + """ + Definition of a rule + """ + def __init__(self, + rule_name: str, + column: pyspark.sql.Column, + boundaries: Bounds = None, + valid_expr: 
pyspark.sql.Column = None, + valid_strings: List[str] = None, + valid_numerics = None, + ignore_case: bool = False, + invert_match: bool = False): + + self._spark = SparkSingleton.get_instance() + self._column = column + self._boundaries = boundaries + self._valid_expr = valid_expr + self._valid_strings = valid_strings + self._valid_numerics = valid_numerics + self._is_implicit_bool = False + + # Determine the Rule type by parsing the input arguments + if valid_strings is not None and len(valid_strings) > 0: + j_valid_strings = Helpers.to_java_array(valid_strings, self._spark._sc) + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, column._jc, + j_valid_strings, + ignore_case, invert_match) + self._rule_type = RuleType.ValidateStrings + + elif valid_numerics is not None and len(valid_numerics) > 0: + j_valid_numerics = Helpers.to_java_array(valid_numerics, self._spark._sc) + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, + column._jc, + j_valid_numerics) + self._rule_type = RuleType.ValidateNumerics + else: + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, column._jc) + self._is_implicit_bool = True + self._rule_type = RuleType.ValidateExpr + + def to_string(self): + return self._jRule.toString() + + def boundaries(self): + return self._boundaries + + def valid_numerics(self): + return self._valid_numerics + + def valid_strings(self): + return self._valid_strings + + def valid_expr(self): + return self._valid_expr + + def is_implicit_bool(self): + return self._jRule.implicitBoolean + + def ignore_case(self): + return self._jRule.ignoreCase + + def invert_match(self): + return self._jRule.invertMatch + + def rule_name(self): + return self._jRule.ruleName + + def is_agg(self): + return self._jRule.isAgg + + def input_column_name(self): + return self._jRule.inputColumnName + + def rule_type(self): + return self._rule_type + + def to_java(self): + return self._jRule + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # RuleSet class + +# COMMAND ---------- + +class RuleSet(): + + def __init__(self, df): + self.spark = SparkSingleton.get_instance() + self._df = df + self._jdf = df._jdf + self._jRuleSet = self.spark._jvm.com.databricks.labs.validation.RuleSet.apply(self._jdf) + + def add(self, rule): + self._jRuleSet.add(rule.to_java()) + + def get_df(self): + return self._df + + def to_java(self): + return self._jRuleSet + + def validate(self): + validation_results = self._jRuleSet.validate(1) + return validation_results + + def get_complete_report(self): + jCompleteReport = self._jRuleSet.getCompleteReport() + return DataFrame(jCompleteReport, self.spark._sc) + + def get_summary_report(self): + jSummaryReport = self._jRuleSet.getSummaryReport() + return DataFrame(jSummaryReport, self.spark._sc) + +# COMMAND ---------- + + diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..31df80d --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/databricks"] + +[project] +name = "dataframe-rules-engine" +version = "0.0.1" +description = "An extensible Rules Engine for custom Apache Spark Dataframe / Dataset validation." 
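+# NOTE: this wheel packages only the Python wrapper; at runtime the Scala
+# rules-engine JAR must also be available to the Spark session (the tests in
+# python/tests/local_spark_singleton.py place it on spark.driver.extraClassPath
+# via the RULES_ENGINE_JAR environment variable).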
+authors = [ + { name="Daniel Tomes", email="daniel.tomes@databricks.com" }, + { name="Will Girten", email="will.girten@databricks.com" }, +] +keywords = ["Spark", "Rules", "Validation"] +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "pyspark" +] +classifiers = [ + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + "License :: Other/Proprietary License", + "Operating System :: OS Independent", +] + +[project.urls] +Homepage = "https://github.com/databrickslabs/dataframe-rules-engine" +Issues = "https://github.com/databrickslabs/dataframe-rules-engine/issues" diff --git a/python/src/databricks/__init__.py b/python/src/databricks/__init__.py new file mode 100644 index 0000000..a2d7d14 --- /dev/null +++ b/python/src/databricks/__init__.py @@ -0,0 +1,3 @@ +from .labs.validation.rule import Rule, RuleType +from .labs.validation.rule_set import RuleSet +from .labs.validation.structures import * diff --git a/python/src/databricks/labs/__init__.py b/python/src/databricks/labs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/databricks/labs/validation/__init__.py b/python/src/databricks/labs/validation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/databricks/labs/validation/local_spark_singleton.py b/python/src/databricks/labs/validation/local_spark_singleton.py new file mode 100644 index 0000000..1cb38c9 --- /dev/null +++ b/python/src/databricks/labs/validation/local_spark_singleton.py @@ -0,0 +1,8 @@ +from pyspark.sql import SparkSession + + +class SparkSingleton: + + @classmethod + def get_instance(cls): + return SparkSession.builder.getOrCreate() diff --git a/python/src/databricks/labs/validation/rule.py b/python/src/databricks/labs/validation/rule.py new file mode 100644 index 0000000..ce8e22d --- /dev/null +++ b/python/src/databricks/labs/validation/rule.py @@ -0,0 +1,91 @@ +import pyspark +from typing import List + +from databricks.labs.validation.local_spark_singleton import SparkSingleton +from databricks.labs.validation.rule_type import RuleType +from databricks.labs.validation.structures import Bounds +from databricks.labs.validation.utils.helpers import Helpers + + +class Rule: + """ + Definition of a rule + """ + # TODO: Fix type hint for valid_numerics + def __init__(self, + rule_name: str, + column: pyspark.sql.Column, + boundaries: Bounds = None, + valid_expr: pyspark.sql.Column = None, + valid_strings: List[str] = None, + valid_numerics = None, + ignore_case: bool = False, + invert_match: bool = False): + + self._spark = SparkSingleton.get_instance() + self._column = column + self._boundaries = boundaries + self._valid_expr = valid_expr + self._valid_strings = valid_strings + self._valid_numerics = valid_numerics + self._is_implicit_bool = False + self._rule_name = rule_name + + # Determine the Rule type by parsing the input arguments + if valid_strings is not None and len(valid_strings) > 0: + j_valid_strings = Helpers.to_java_array(valid_strings, self._spark._sc) + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, column._jc, + j_valid_strings, + ignore_case, invert_match) + self._rule_type = RuleType.ValidateStrings + + elif valid_numerics is not None and len(valid_numerics) > 0: + j_valid_numerics = Helpers.to_java_array(valid_numerics, self._spark._sc) + print(j_valid_numerics) + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, + column._jc, + j_valid_numerics) + self._rule_type = 
RuleType.ValidateNumerics + else: + self._jRule = self._spark._jvm.com.databricks.labs.validation.Rule.apply(rule_name, column._jc) + self._is_implicit_bool = True + self._rule_type = RuleType.ValidateExpr + + def to_string(self): + return self._jRule.toString() + + def boundaries(self): + return self._boundaries + + def valid_numerics(self): + return self._valid_numerics + + def valid_strings(self): + return self._valid_strings + + def valid_expr(self): + return self._valid_expr + + def is_implicit_bool(self): + return self._jRule.implicitBoolean + + def ignore_case(self): + return bool(self._jRule.ignoreCase) + + def invert_match(self): + return bool(self._jRule.invertMatch) + + def rule_name(self): + return self._rule_name + + def is_agg(self): + return self._jRule.isAgg + + def input_column_name(self): + return self._jRule.inputColumnName.toString() + + def rule_type(self): + return self._rule_type + + def to_java(self): + return self._jRule diff --git a/python/src/databricks/labs/validation/rule_set.py b/python/src/databricks/labs/validation/rule_set.py new file mode 100644 index 0000000..e5c7f0c --- /dev/null +++ b/python/src/databricks/labs/validation/rule_set.py @@ -0,0 +1,32 @@ +from pyspark.sql import DataFrame + +from databricks.labs.validation.local_spark_singleton import SparkSingleton +from databricks.labs.validation.structures import ValidationResults, MinMaxRuleDef + + +class RuleSet(): + + def __init__(self, df): + self.spark = SparkSingleton.get_instance() + self._df = df + self._jdf = df._jdf + self._jRuleSet = self.spark._jvm.com.databricks.labs.validation.RuleSet.apply(self._jdf) + + def add(self, rule): + self._jRuleSet.add(rule.to_java()) + + def addMinMaxRule(self, minMaxRuleDef): + pass + + def get_df(self): + return self._df + + def to_java(self): + return self._jRuleSet + + def validate(self): + jValidationResults = self._jRuleSet.validate(1) + complete_report = DataFrame(jValidationResults.completeReport(), self.spark) + summary_report = DataFrame(jValidationResults.summaryReport(), self.spark) + validation_results = ValidationResults(complete_report, summary_report) + return validation_results diff --git a/python/src/databricks/labs/validation/rule_type.py b/python/src/databricks/labs/validation/rule_type.py new file mode 100644 index 0000000..7642c12 --- /dev/null +++ b/python/src/databricks/labs/validation/rule_type.py @@ -0,0 +1,9 @@ + +class RuleType: + + ValidateExpr = "expr" + ValidateBounds = "bounds" + ValidateNumerics = "validNumerics" + ValidateStrings = "validStrings" + ValidateDateTime = "validDateTime" + ValidateComplex = "complex" diff --git a/python/src/databricks/labs/validation/structures.py b/python/src/databricks/labs/validation/structures.py new file mode 100644 index 0000000..191208e --- /dev/null +++ b/python/src/databricks/labs/validation/structures.py @@ -0,0 +1,51 @@ +import pyspark +from typing import List + +from databricks.labs.validation.local_spark_singleton import SparkSingleton + + +class Bounds: + + def __init__(self, lower, upper, + lowerInclusive=False, + upperInclusive=False): + self.lower = lower + self.upper = upper + self.lowerInclusive = lowerInclusive + self.upperInclusive = upperInclusive + self._spark = SparkSingleton.get_instance() + self._jBounds = self._spark._jvm.com.databricks.labs.validation.utils.Structures.Bounds(lower, upper, + lowerInclusive, + upperInclusive) + + def validationLogic(self, col): + jCol = col._jc + return 
self._spark._jvm.com.databricks.labs.validation.utils.Structures.Bounds.validationLogic(jCol) + + +class MinMaxRuleDef: + + def __init__(self, + rule_name: str, + column: pyspark.sql.Column, + bounds: Bounds, + by: List[pyspark.sql.Column] = None): + self.rule_name = rule_name + self.column = column + self.bounds = bounds + self.by = by + + +class ValidationResults: + + def __init__(self, + complete_report: pyspark.sql.DataFrame, + summary_report: pyspark.sql.DataFrame): + self.complete_report = complete_report + self.summary_report = summary_report + + def get_complete_report(self): + return self.complete_report + + def get_summary_report(self): + return self.summary_report diff --git a/python/src/databricks/labs/validation/utils/__init__.py b/python/src/databricks/labs/validation/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/src/databricks/labs/validation/utils/helpers.py b/python/src/databricks/labs/validation/utils/helpers.py new file mode 100644 index 0000000..c7fc646 --- /dev/null +++ b/python/src/databricks/labs/validation/utils/helpers.py @@ -0,0 +1,12 @@ +class Helpers: + + @staticmethod + def to_java_array(py_array, sc): + if isinstance(py_array[0], str): + java_string_class = sc._jvm.java.lang.String + java_array = sc._gateway.new_array(java_string_class, len(py_array)) + for i in range(len(py_array)): + java_array[i] = py_array[i] + else: + raise Exception("Only List of Strings is currently supported.") + return java_array diff --git a/python/tests/local_spark_singleton.py b/python/tests/local_spark_singleton.py new file mode 100644 index 0000000..4864612 --- /dev/null +++ b/python/tests/local_spark_singleton.py @@ -0,0 +1,34 @@ +from pyspark.sql import SparkSession +from pyspark import SparkConf +import os + + +class SparkSingleton: + """A singleton class which returns one Spark instance""" + __instance = None + + @classmethod + def get_instance(cls): + """Create a Spark instance. 
+ :return: A Spark instance + """ + config = SparkConf().setAll([("spark.driver.extraClassPath", + os.environ["RULES_ENGINE_JAR"])]) + spark = (SparkSession.builder + .config(conf=config) + .appName("DataFrame Rules Engine") + .getOrCreate()) + spark.sparkContext.setLogLevel("ERROR") + return spark + + @classmethod + def get_local_instance(cls): + config = SparkConf().setAll([("spark.driver.extraClassPath", + os.environ["RULES_ENGINE_JAR"])]) + spark = (SparkSession.builder + .config(conf=config) + .master("local[*]") + .appName("DataFrame Rules Engine") + .getOrCreate()) + spark.sparkContext.setLogLevel("ERROR") + return spark diff --git a/python/tests/test_rule.py b/python/tests/test_rule.py new file mode 100644 index 0000000..e63fb0b --- /dev/null +++ b/python/tests/test_rule.py @@ -0,0 +1,30 @@ +import unittest +import pyspark.sql +import pyspark.sql.functions as F + +from src.databricks.labs.validation.rule import Rule +from src.databricks.labs.validation.rule_type import RuleType +from tests.local_spark_singleton import SparkSingleton + + +class TestRule(unittest.TestCase): + + def setUp(self): + self.spark = SparkSingleton.get_instance() + + + def test_string_lov_rule(self): + """Tests that a list of String values rule can be instantiated correctly.""" + + # Ensure that a rule with a list of valid strings can be validated + building_sites = ["SiteA", "SiteB", "SiteC"] + building_name_rule = Rule("Building_LOV_Rule", column=F.col("site_name"), + valid_strings=building_sites) + + # Ensure that all attributes are set correctly for Integers + assert building_name_rule.rule_name() == "Building_LOV_Rule", "Rule name is not set as expected." + assert building_name_rule.rule_type() == RuleType.ValidateStrings, "The rule type is not set as expected." + assert not building_name_rule.ignore_case() + + def tearDown(self): + self.spark.stop() diff --git a/python/tests/test_rule_set.py b/python/tests/test_rule_set.py new file mode 100644 index 0000000..0b328cc --- /dev/null +++ b/python/tests/test_rule_set.py @@ -0,0 +1,145 @@ +import unittest + +from src.databricks import RuleSet, Rule +from tests.local_spark_singleton import SparkSingleton + +import pyspark.sql.functions as F + + +def valid_date_udf(ts_column): + return ts_column.isNotNull() & F.year(ts_column).isNotNull() & F.month(ts_column).isNotNull() + + +class TestRuleSet(unittest.TestCase): + + def setUp(self): + self.spark = SparkSingleton.get_instance() + + def test_create_ruleset_from_dataframe(self): + test_data = [ + (1.0, 2.0, 3.0), + (4.0, 5.0, 6.0), + (7.0, 8.0, 9.0) + ] + test_df = self.spark.createDataFrame(test_data, schema="retail_price float, scan_price float, cost float") + test_rule_set = RuleSet(test_df) + + # Ensure that the RuleSet DataFrame is set properly + assert test_rule_set.get_df().exceptAll(test_df).count() == 0 + + def test_list_of_strings(self): + iot_readings = [ + (1001, "zone_a", 50.1), + (1002, "zone_b", 25.4), + (1003, "zone_c", None) + ] + valid_zones = ["zone_a", "zone_b", "zone_c", "zone_d"] + df = self.spark.createDataFrame(iot_readings).toDF("device_id", "zone_id", "temperature") + rule_set = RuleSet(df) + + # Add a list of strings + valid_zones_rule = Rule("valid_zones", F.col("zone_id"), valid_strings=valid_zones) + rule_set.add(valid_zones_rule) + + # Ensure that the summary report contains no failed rules + validation_summary = rule_set.validate().get_summary_report() + assert validation_summary.where(F.col("failed_rules").isNotNull()).count() == 0 + + # Add a row that _should_ fail + 
new_iot_reading = [ + (1004, "zone_z", 30.1) + ] + new_reading_df = self.spark.createDataFrame(new_iot_reading).toDF("device_id", "zone_id", "temperature") + combined_df = df.union(new_reading_df) + new_rule_set = RuleSet(combined_df) + new_rule_set.add(valid_zones_rule) + new_validation_summary = new_rule_set.validate().get_summary_report() + + # Ensure that the added reading should fail due to an invalid zone id string + assert new_validation_summary.where(F.col("failed_rules").isNotNull()).count() == 1 + + def test_list_of_numerics(self): + iot_readings = [ + (1001, "zone_a", 50.1), + (1002, "zone_b", 25.4), + (1003, "zone_c", None) + ] + valid_device_ids = [1001, 1002, 1003, 1004, 1005] + df = self.spark.createDataFrame(iot_readings).toDF("device_id", "zone_id", "temperature") + rule_set = RuleSet(df) + + # Add a list of numerical values + valid_device_ids_rule = Rule("valid_device_id", F.col("device_id"), valid_numerics=valid_device_ids) + rule_set.add(valid_device_ids_rule) + + # Ensure that the summary report contains no failed rules + validation_summary = rule_set.validate().get_summary_report() + assert validation_summary.where(F.col("failed_rules").isNotNull()).count() == 0 + + def test_boolean_rules(self): + iot_readings = [ + (1001, "zone_a", 50.1), + (1002, "zone_b", 25.4), + (1003, "zone_c", None) + ] + df = self.spark.createDataFrame(iot_readings).toDF("device_id", "zone_id", "temperature") + rule_set = RuleSet(df) + + # Add a rule that `device_id` is not null + not_null_rule = Rule("valid_device_id", F.col("device_id").isNotNull()) + rule_set.add(not_null_rule) + + # Add a rule that `temperature` is > -100.0 degrees + valid_temp_rule = Rule("valid_temp", F.col("temperature") > -100.0) + rule_set.add(valid_temp_rule) + + validation_summary = rule_set.validate().get_summary_report() + assert validation_summary.where(F.col("failed_rules").isNotNull()).count() == 0 + + def test_udf_rules(self): + iot_readings = [ + (1001, "zone_a", 50.1, "2024-04-25"), + (1002, "zone_b", 25.4, "2024-04-24"), + (1003, "zone_c", None, "2024-04-24") + ] + df = self.spark.createDataFrame(iot_readings).toDF("device_id", "zone_id", "temperature", "reading_date_str") + df = df.withColumn("reading_date", F.col("reading_date_str").cast("date")).drop("reading_date_str") + rule_set = RuleSet(df) + + # Ensure that UDFs can be used to validate data quality + valid_reading_date_rule = Rule("valid_reading_date", valid_date_udf(F.col("reading_date"))) + rule_set.add(valid_reading_date_rule) + + validation_summary = rule_set.validate().get_summary_report() + assert validation_summary.where(F.col("failed_rules").isNotNull()).count() == 0 + + def test_add_rules(self): + iot_readings = [ + (1001, "zone_a", 50.1), + (1002, "zone_b", 25.4), + (1003, "zone_c", None) + ] + df = self.spark.createDataFrame(iot_readings).toDF("device_id", "zone_id", "temperature") + rule_set = RuleSet(df) + + # Test boolean rule + temp_rule = Rule("valid_temp", F.col("temperature").isNotNull()) + rule_set.add(temp_rule) + + # Ensure that the RuleSet DF can be set/gotten correctly + rule_set_df = rule_set.get_df() + assert rule_set_df.count() == 3 + assert "device_id" in rule_set_df.columns + assert "zone_id" in rule_set_df.columns + assert "temperature" in rule_set_df.columns + + # Add a list of strings + valid_zones_rule = Rule("valid_zones", F.col("zone_id"), valid_strings=["zone_a", "zone_b", "zone_c"]) + rule_set.add(valid_zones_rule) + + # Ensure that the summary report contains failed rules + validation_summary = 
rule_set.validate().get_summary_report() + assert validation_summary.where(F.col("failed_rules").isNotNull()).count() == 1 + + def tearDown(self): + self.spark.stop() diff --git a/python/tests/test_structures.py b/python/tests/test_structures.py new file mode 100644 index 0000000..6879ff9 --- /dev/null +++ b/python/tests/test_structures.py @@ -0,0 +1,39 @@ +import unittest + +from src.databricks.labs.validation.structures import MinMaxRuleDef, Bounds +from tests.local_spark_singleton import SparkSingleton + +import pyspark.sql.functions as F + + +class TestStructures(unittest.TestCase): + + def setUp(self): + self.spark = SparkSingleton.get_instance() + + def test_get_returns(self): + + # Test Bounds + sku_price_bounds = Bounds(1.0, 1000.0) + assert sku_price_bounds.lower == 1.0 + assert sku_price_bounds.upper == 1000.0 + assert not sku_price_bounds.lowerInclusive + assert not sku_price_bounds.upperInclusive + sku_price_bounds_inclusive = Bounds(1.0, 1000.0, lowerInclusive=True, upperInclusive=True) + assert sku_price_bounds_inclusive.lowerInclusive + assert sku_price_bounds_inclusive.upperInclusive + + # Test MinMax Definitions + min_max_no_agg = MinMaxRuleDef("valid_sku_prices", F.col("sku_price"), bounds=sku_price_bounds) + assert min_max_no_agg.rule_name == "valid_sku_prices", "Invalid rule name for MinMax definition." + assert min_max_no_agg.bounds.lower == 1.0 + assert min_max_no_agg.bounds.upper == 1000.0 + + min_max_w_agg = MinMaxRuleDef("valid_sku_prices_agg", F.col("sku_price"), bounds=sku_price_bounds, + by=[F.col("store_id"), F.col("product_id")]) + assert min_max_w_agg.rule_name == "valid_sku_prices_agg", "Invalid rule name for MinMax definition!" + assert min_max_w_agg.bounds.lower == 1.0 + assert min_max_w_agg.bounds.upper == 1000.0 + + def tearDown(self): + self.spark.stop() From 700bfdf7f11853225e6966920192154b7e49925b Mon Sep 17 00:00:00 2001 From: Will Girten Date: Wed, 1 May 2024 11:11:05 -0400 Subject: [PATCH 2/7] Update gitignore for Python modules --- .gitignore | 1 + python/.idea/.gitignore | 3 --- python/.idea/inspectionProfiles/profiles_settings.xml | 6 ------ python/.idea/misc.xml | 7 ------- python/.idea/modules.xml | 8 -------- python/.idea/vcs.xml | 6 ------ 6 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 python/.idea/.gitignore delete mode 100644 python/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 python/.idea/misc.xml delete mode 100644 python/.idea/modules.xml delete mode 100644 python/.idea/vcs.xml diff --git a/.gitignore b/.gitignore index 7f7ecc2..295d300 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/python/.idea /.idea/ *.iml #local spark context data from unit tests diff --git a/python/.idea/.gitignore b/python/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/python/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/python/.idea/inspectionProfiles/profiles_settings.xml b/python/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/python/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/python/.idea/misc.xml b/python/.idea/misc.xml deleted file mode 100644 index 812ab5a..0000000 --- a/python/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/python/.idea/modules.xml b/python/.idea/modules.xml deleted file mode 100644 index 614b3c1..0000000 --- 
a/python/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/python/.idea/vcs.xml b/python/.idea/vcs.xml deleted file mode 100644 index 6c0b863..0000000 --- a/python/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 5faa6d4ee9cd7697e6c44d32862425e21bd4bcde Mon Sep 17 00:00:00 2001 From: Will Girten Date: Wed, 1 May 2024 13:58:31 -0400 Subject: [PATCH 3/7] Rename sample notebooks and update gitignore. --- .gitignore | 11 ++++++++--- ...dataframe_rules_engine-0.0.1-py3-none-any.whl | Bin 5141 -> 0 bytes python/dist/dataframe_rules_engine-0.0.1.tar.gz | Bin 4420 -> 0 bytes ... 01_generate_sample_purchase_transactions.py} | 0 ...py => 02_apply_purchase_transaction_rules.py} | 0 5 files changed, 8 insertions(+), 3 deletions(-) delete mode 100644 python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl delete mode 100644 python/dist/dataframe_rules_engine-0.0.1.tar.gz rename python/examples/{01-Generate Mock Purchase Transactions.py => 01_generate_sample_purchase_transactions.py} (100%) rename python/examples/{02-Apply Purchase Transaction Rules.py => 02_apply_purchase_transaction_rules.py} (100%) diff --git a/.gitignore b/.gitignore index 295d300..51a0286 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,10 @@ -/python/.idea /.idea/ *.iml -#local spark context data from unit tests + +# Local spark context data from unit tests spark-warehouse/ -#Build dirctory for maven/sbt +# Build dirctory for maven/sbt target/ project/project/ project/target/ @@ -12,3 +12,8 @@ project/target/ /target/ /project/build.properties /src/main/scala/com/databricks/labs/validation/LocalTest.scala + +# Python Wrapper +/python/.idea/ +/python/dist/ + diff --git a/python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl b/python/dist/dataframe_rules_engine-0.0.1-py3-none-any.whl deleted file mode 100644 index f33a54a04444d299c7373b5daf51a2249840d179..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5141 zcma)=2UOEb7KcN7bVLXsMVd52uS!$ty%<6lV~_};B$NaN6r^`hI!GsgAWfQ7A0S;y zsDkt&pn?Si1W`7;eXg3_yeNG0RR96-c*e*Gdb();d|kD zW5$~u3=Ok&M_hOE5Qjn$NCX-R6?65{6Yd=}#XZ;22aOBs_4VlMi1hRgjtGPLbxgba zM7za7mnn!VF)eo5a21d_n<4GTjw1fGwM3b z8S3E*b9aJzAdvRXa5M^u-yG9uNxDD@8jYJ-ZD&N}0Ve!(IDZld!q#E5UB^ZG(@t<$ zKI*m+)GyJh;!cg1m*Q7D{d`l&D~6BnITq6Zcznk61ZJfyUr_>1Ae)&g&7*D9t=!J| z@R@CKI8a!26%{h_Oa&j(O?w#0$JJf;Ot$Q1r#*8(uo-a?)w65(jSQjz0CBpP)2dcxGw{Wl-arNMw2tc6yME_^3jR~HxCzRs!+kXF&CuLLnPZUJ0%GC7f44DMZ5 z!xV>-DAnAH!mO2;FqCy?Hd@%bb<(WbWNaMf_j(P;V@EV`3=9FG834Pu7x~%^QJwqZ zv8d^3(pxTLIT8MbR{7mb^~uBn75yO#`Cyaf!bR7r6u&E-FJGUF`Ia8BhV^3aqT*q~ zvW`9Mw9<{iDfVvjnXRj&R$~RGLgwKNUUf{$$SYChYEzwLr}SgwAa)fNF}9V4!7GDn zF6Gj^`>hmX!ch->CBhxIKec!&g`XCZUufX$M@I~-il~v$F}>NYSMOps=4ZUr$9zRY z=IK=~iek9%T=TWuV%nVImc*dfEVf>C&^DQq!*w)_Ai1DG3(_$DJ^c&(DylcjWgLWM z<)mhYlnsi&pXfJBLV7x~oLODv#BaX4WX^+rBEF=b^zr@%h26XYit~lq*0jRpdvqn4Jn7-T%;k6Y}RtfCKp|q2zxEAAvrC7_2~vn z&-9p${?=I*WSiolP`DNu=GvEcuWR!BevLPUEPse<5di>CNdSO<2pUKC1nL1to_A@2^%yxHxZL4-QW$9z^?9%_)Y;$daY9nOa$n!O_1vZ^^zD$7 z2N|I#wVKlzv7bnIZ+lm0hozmB`U-7&_uikqj`SNKkFj8;CHie^SceL!@)cUmbHJ=E znk?%5Lh~@+H%rTsz`*kD%^;YRB)zIP2@h_>?nBgl%=;i5nhyjD;zR=j$YkDJ%GXVn z$zom9d+=%EQ%Cu!cq#Q$-gYyn@OvOitRv1kB)NP?Y&VIb#xZ~^X?I|gg```RZpk6j zxpZ_aj2mva%J`)JNz#fm>c)q)c3RGshUlJj^WqNsrFqF{Ukej+@5i?~#DgD!y4uV< zr#7nliah^xoO>fTyw+JQ2INdGbSo)O*iiP$$zqm!OZULW*-Xij`rDq-V4tsx<%`rS zdQUT0oEACK%o;rVe5^M~emx4~&;lG1AF5{XCiu@G3ySt~J&IdPx&t~QVrCTh+(TO= zqFuAN2nGaHw5Rtaw8ueSLn{3_*Y#IlSrxf)=fCK 
zWD20Tay~A%uL&Il65=wwPfZ_cw^Gn%+DdAiD8IVOyo@!w3*lEw(Y-u8KS+ylSn8i)aEiJQLpi@B>lHiGL~QF3iDq-aL%%tpSIWzfB_ zq!Z#KllyZwX=Eka4a9B4vd>AQ=4qwDyW?KNCveicD z6nf_Ff@;aU?R0qsN^^SM3mUYCi8O^5N>hd8&NQXs3J`U+faT6Vssd{f6lEC`@n!Lv z`(@Vt#$SYU*Do%VX|MTA=bAju-D`Day2RPBt!%>0JZ*dHRV}h=E+s+}OONr`y%2}N)nWb_$wa`3d^bgmssf}FCPQP{IcK?SPo%O} zE;t+qZs-~ZW-u;ch4~ww1M{ z>G{RO3CwUnVV_%gK|??JuYQd8ga1evk>i~LgGM+Xt7#G>K1}uE8CLjIF8n(z2e`8< z-2KSM`tM0n2VSBHzUil-8SIW`raC=Uq^5O}l5HdL?b-T=pWpjf4e93KN>gK|lXtJi zv7_k>?bvKt$ca8HD5wnW1umPhm=tC+UnDVyKn+&1>CaJh={M zAmcPI1&MdefXE(~-cnTV2nt~uT9B_-FIGM>mKv1@HFRYfuibx6-L&zfLVvjOj#{AL zuiLwF>&3-n{GFr3M{M!Gx8#O9%mofT3eX-Ng zL`7W%qLKk_MNUc3G`|(5SZABs@Jz!oQKyv&CzWap*VuR5aF(sa-rtlu{3^uqI*mxN zghdU@#nscb%M_2$m+DYvn3|Vu5;Yn-H`C!hKq0!TA3T!W17jZA3N%@tB{_xwd<)Zz=iok#7FjRNb8n?77F*Y!tydYvuU@r+eEs9s zGp197PMhM-)*Nr&uh4ItvAMQ}hCcP6wn6_8t)3`opl`4WEHNrQ?{3tuC#oybBVGya z))vtj=XVcY`M z6NLT5sYwe3{Ef^c1pq{U-vz-MYDQr7n&Ibhxe~y;3wJJ3Ph1J=wq?#%QMq&y=oU{! zpP=ZDXf5QjFKFm2PJQ#vpd6gou`5_;7^KJ{qmxooPQec!=8-AVDsH}JNbmQVjYLgN z*vkWLi*U=|Nb|TPxHmpCK7yup$lz&y38>Gt;A~R&tKlF&qUc`a$RnSKJ@VVnyf*9fiZ`t{nyB1jUb3BIb&TIkk1qpdslxk6 zMT|%LOM;vm>scd|KvAm(j{-4Frv&4w-?~MJE__Rgzh4j?PCMo zm$q!b{gBrBt6Pjy^Q2``c*zKD;yV);^-&bQ`9D|=p za$gLOGP*=8O~ih#b=t3Z%RYQzkBQ}ZFgvde>C0*MrYd?JxXFV^0h<^)u>ee^?}Ndf zv$G~86c^GB=lJ(_k!x9P!OOCUOutsnY_NGM?H3`NXV--@A+plb%4Iq+12}OY zYf*jb3#@lG1`*EK!!=@pnX}m5$E5%P^N(0jWH}#ag*=~<&|Yl}vnR^iYaMN~&gE_I zwu=}D@q<}FYmhyO{SPOT85PgZtuq6+HyMrfh=_qC|G!knd*WXW>wom|Q|0_q;ZDdn z`X%0z|7u6|_V33bp9a?J0Lf2bltFrk|G511&2>fcNL zjlA<+TM)tt)7TGqF#d0uqoex`_+LqlkW83?evtWSf1CX8d}ORghW8@?fENEU!+UAb IiKB1-0?JYHyZ`_I diff --git a/python/dist/dataframe_rules_engine-0.0.1.tar.gz b/python/dist/dataframe_rules_engine-0.0.1.tar.gz deleted file mode 100644 index 51ab2f1f48a57b167dede7f18dba52cbfa55f94c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4420 zcmV-K5xedmiwFP!5jI`||Lr~7a@)9)dFEGOnW|ILluSvMY}cwrsqxs!oa{-QRGgW8 z@NubV5|Wruq=o=(Yq$2_w{amzfRse}q9ilyhnNBz-9V#nxG}y<-D%=3@FYn?OeQ#* z1ra{zIX$P}C5hMd0pTVIysxA?nFLWlClhD6da(=i0FH-4{ujXf+dJ&{U-SpZ{i9*O zH)Q39y?%fA0`*?(0a8NU1e)^V?+Mc+ULYrQZ-{g2h5?Lr5JwKv1r#inaYE52c<%6j zKd+Y9T3=y8FfCTP#B>cdp-Jk|G{Gdp|H*pMz_$Q@2m5Ek_z)+s&HmZ5ZvXw>px@a4 zcRBvK8P`^Y-N%2>KOQvW|GkfY7<+CwAxk&;Iw3(c3o!-fURH*!@qaKh#{U=+fM)#n z7XOzkK8OzaGt2_=B`dhZgup;smgR*mA+j(V>5*k2fM58#1*Y@Z=LJ5Vq8X;(96+>@ zhg)7q+9UpoL4qOjM;Zg?CJjO#Ck|BmkenshrMT5@QiAW?{#lJWX}}iyKR)c4_J26+ zoiz5p50K_AJ%Gs9>1qi!o_T_I0il+7k?TS!rLT+yx--d2x60E^!da2 zhY@-c+#}wjc@R-FjT6L~Poi{zp+!VF&Mc>D5gblC%4sU#vWdW6M+iDwf#Ahqx`;+a zpYELTvI^uICLK2kFd0cnp)tCMBV2(i8jSCkNfq&J^`NmZ{=kN*(-r)>-c}{+?H{S0Ta3-v>0CNYU^k*t!>otGAW1$ z<83Zyz{r0XSh<7&EB{f6CFQ9QW#Dk z{Zf*Fv{Z-2lP~VAJ8_f5UFmc>#leL?UR$FnUVh;n8tbTEZWoRhFPshTgH?0fS#T5?kbCTj+`}UeSX%)di--JE# z5MG}66uJqzDNF*wiK?H6hm8aZQAAbM$7o}9BM=xQq(iAeffP+TC&1!XjoMg~oH-md z1eoFf3h3C43bb;kRfi16K6A5J$0-y%T^$k}{XN6ZX(&ysL4e_20IDVrcUDJ@0&+T= z6d2eeIB{~+dn=zHrIa-PY#>; z|F>NKJBML^lWTfh$j+kze#Y+~quEZs$*-f-(~I%>K;&Db^0dOGJ_(9C5oqg!R{-ZuHI2 zsoi}!+w;{^0k)KDRQpmIQovnWmGWf8yA!cBe2F zg&|HC3pZKSq$#mClgx(`sAl~u(P|mp=@?CEn_@1LwY~%Y&w~Dz0I+TUXD~Q2{r_;- ztpDx>3ci%hX{0MH&TK$;ZI-9o+8ec6X!D5((cfO=({P(BOje+yU8)5ffhB+INMMin zbFjb)D=W~^&bT$*Gb~|P_jZ$5KCk`f!Fsa;*kb>K!~UUZ|Hns-{r_e6f3gHl_>lG4 zTNvNrq=VAs5?AeX@Yio6FHA{ri^uC${&{r+&CHrODR3S9ul|ZxhgbY>MJ+`+tF8(v z=oFFlf+yI5_@aFBu4`vv_vig)+m3ZhN;uZXRFy+TV)a$F>b(`(z z2N5+k>=`M!p-McFB(X9`jiQvucW)MfSPS_sB~a*}8Q62w zCZ7NJ{-0t(N$q3)58nSf8JhmTf7qP={kHpm+WG$?l7?^=M^j4{#ANL~0eK?!6y+;J z0`YDhcysAH2x2F`E(H&{#*rPPov2k0IbnR*)dL_WE*BuzVB{#Ak!Z~bof;@bDupen 
zXd2A2g&HP~R@)(%o`zxTs$~nY`blsL;Q`;%#64pz{^-)VU3Y;aCJv5ngCvfw>`%YH zdwn^1eev7di`SF?I{jo{x35e4M6If;6(welXnKqN8H?w38OdEP*>RATt>&bBtAL=_ z+)vmQRkS-C?JWZ(e9M~Q`l}0f;S!3Imd%B5_0zTeECcXp|IhxN)V=GmyZ`U^Pfm>c z-~D5T-uVCht^cPmg=gbz>H7gJB2*MP(=_tf6p$d7pf_@N0A`EMmRlXg3mvs9ZMX`; z74$)S51*kXqt(iK?ywSe0q~9$a5ofgJo2r0T{UtjxtHGN5hrFVpHL?{_BXO3C^hU`gZ}H z!?Qg3>4xathd(l+9(NkPBEa1nYjTu12zmUqW&fJJg6b1iWM_wM$8Xr$EV$=@igjDN zUB?@1$4SRm_9aw51)}`n*X-d8obuWZh9Pn(5V%P}+{lG;_C*3E96s%lbG3X)7?c5n znv}bmFG9E4Xgo%hUAFmYWxi4^&#g0-U=;X%2~4r?*fltn4{QpKxd?&NTYjZNE_*+g zV}7cWb6R&&AJ~4LW1%?03Tv`tNL3v*Px!Wr9LPLQLtoToXa7-#%kS%8C}8IhWa!<2 zB|AjBnGs+lcqUwqZCI_=U}(e^+uhjejPtGr?8?kJ8~%7IZ{0`^8=XQSpNc~!99B!W z=C@fN&m@XeZ)wWKigx?K25630sXZl=Ti31!m|dCKI;+!q`zXU()4>gP#}3>)*yt*{ z_{{cJ$?-wvRu!)*>_;kkNr=`?HMS@0CIQK9bm96CpfJ6| zILg_w8+|$P8lefqk}P7jBT*|j)krn-t4HHjtJ=;Fs_h)T8lw}Z_XGyNg!GEGa;q)O zM4T7bg%g>vr`x&k({ItLgQuraiv|>f9u@!d%*tR z;n2MQ(LXtAzW@1c)_-xShDwARmgFRu_mv>*T zG{@QM_j-LW{r`z0JaNIV9)TqqX)VJ!%y4c1=kUlGR^SXW91n2#Ia`I2pWNWr7z_SE z%&GHRaK+ZP9U7m%Y1ux$4Lm#vd^_VH9$B7@6PKM~E@ZT7uaPds_c9-T^-)n>8=tce*syz=knYH73)JtJr7<36R#joekw<+#sX+*xmpRmE`Cvb>*{QflRT@7A5GnW%^$g% zSKWWi>J@~gZg#eviu1zl%4|R*=G?(F24|U=Xn`;^Lotdn?>{0&{>{(-WP4Y&?s`2W z{vREml)nGnod5lfeQytEyP{$O7 z(2v6R5_7REf@rdE?UZCp)K6bm-PfoB1t(QDZBM}c-l8^5xzx9oHC}-hS#B+&1&rH!94o;fnE zj@O~_B783U?;rO~`#@JQRayFP1u4R4zH~x)OKf*xPG@7I9OLw*{&`5moZ!JtDra|#WbXIGBS zGhV36pV@kKVada`+!Xv|k}cT(FQ0de?Q=H@aELy~3qD)Kkc)(4XgDinR)fheg@qJI zPi*vmokIQp4#E)q79`Nvy!ss!Lhj89t7>0cU-9ZLPJAIr-u`S4m`_>rFIw1aNL^ou zQ-yO|jq`!XdDz=m&tk@;jLEMTEcqK<7B=LT#7^V#?N&<;x8B17F)_WG}fQ z^GcWKZItzf+DvRp#hf$=$+h)Q7@s9;1Jh?pmt@rK!jxy8-Z*+9$@9o{+W3}`l*{VD zG`ayN@hv`Bw$cB8{MT;>Z!g|_cqET){(orx|BwCQVXwLWzZZB9_QZ7H;1A(bN9fS$ zTNnJ}K-J1>DNL>;)}`2$8KI|ly4FWwt_Qz=dN)FuwRX@$?M$a9?DpZ*q}5udJTM$^ z-#tRbfPjA95C~p%P{H}5pVvG6SJuCzzaI&)cDVF9xtq7nlv5s|I*wDTu-$uV=$pOx z4ye-MTc5IEB#K9fhoZdrJRtcOlC1MQ)E}I&C5nh1@K0AFwp!5zUHt0Dcadc^Q=0}F zXrO@x8fc(_1{!Fffd(3Apn(P&XrO@x8fc(_1{!Fffd(3Apn(P&XrO@x8ff4z3I7jV K1Nxo-pa1|%Ub|KR diff --git a/python/examples/01-Generate Mock Purchase Transactions.py b/python/examples/01_generate_sample_purchase_transactions.py similarity index 100% rename from python/examples/01-Generate Mock Purchase Transactions.py rename to python/examples/01_generate_sample_purchase_transactions.py diff --git a/python/examples/02-Apply Purchase Transaction Rules.py b/python/examples/02_apply_purchase_transaction_rules.py similarity index 100% rename from python/examples/02-Apply Purchase Transaction Rules.py rename to python/examples/02_apply_purchase_transaction_rules.py From 6d451deff917ec2e4c1566c96ec1b08a4b4d2776 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 7 May 2024 10:11:19 -0400 Subject: [PATCH 4/7] Update README file with build instructions. --- python/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/README.md b/python/README.md index ba84402..296d1c6 100644 --- a/python/README.md +++ b/python/README.md @@ -45,3 +45,11 @@ def valid_date_udf(ts_column): # Create a Rule that uses the UDF to validate data valid_date_rule = Rule("valid_date_reading", valid_date_udf(F.col("reading_date"))) ``` + +## Building the project + +A Python `.whl` file can be generated by navigating to the `/python` directory and executing the following command : + +```bash +$ python3 -m build +``` From df68909d50359ac818d3aee35d047ed91277d1cc Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 7 May 2024 11:43:09 -0400 Subject: [PATCH 5/7] Bump up codecov version to 3. 
--- .github/workflows/scala.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index 490c3d3..43580a4 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -23,6 +23,6 @@ jobs: - name: Coverage Report run: sbt coverageReport - name: "Upload coverage to Codecov" - uses: "codecov/codecov-action@v2" + uses: "codecov/codecov-action@v3" with: fail_ci_if_error: true From 7f1edfdbd579af999497e234eb80a7ae9de4dacf Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 7 May 2024 16:46:21 -0400 Subject: [PATCH 6/7] Add retry to codecov --- .github/workflows/scala.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index 43580a4..f40d971 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -13,16 +13,23 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Set up JDK 8 uses: actions/setup-java@v2 with: java-version: '8' distribution: 'adopt' + - name: Run tests run: sbt coverage test + - name: Coverage Report run: sbt coverageReport + - name: "Upload coverage to Codecov" uses: "codecov/codecov-action@v3" with: fail_ci_if_error: true + max_attempts: 3 + retry_on: error + retry_wait_seconds: 10 From 902575a11c478712bcb741df8cd4b8fdd4fe5968 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 7 May 2024 17:04:29 -0400 Subject: [PATCH 7/7] Remove invalid params in codecov block --- .github/workflows/scala.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index f40d971..04d4e5e 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -8,28 +8,20 @@ on: jobs: build: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v2 - - name: Set up JDK 8 uses: actions/setup-java@v2 with: java-version: '8' distribution: 'adopt' - - name: Run tests run: sbt coverage test - - name: Coverage Report run: sbt coverageReport - - name: "Upload coverage to Codecov" uses: "codecov/codecov-action@v3" with: fail_ci_if_error: true - max_attempts: 3 - retry_on: error - retry_wait_seconds: 10 +