From 0598f0eab584fb3a3b2f003bbc968748ad5a23e9 Mon Sep 17 00:00:00 2001 From: Andy Challis Date: Mon, 29 Jan 2024 16:06:02 +1100 Subject: [PATCH] Adding flatfile demo showing how a developer can make multiple outputs from one schema (#97) * Adding flatfile demo showing how a developer can make multiple outputs from one schema --- .gitignore | 1 + examples/flatfile/BUILD | 5 +++ examples/flatfile/README.md | 17 ++++++++ examples/flatfile/main.py | 68 +++++++++++++++++++++++++++++ examples/flatfile/output.csv | 6 +++ examples/flatfile/output.json | 1 + examples/flatfile/output.jsonl | 5 +++ examples/flatfile/output.parquet | Bin 0 -> 2034 bytes examples/flatfile/output.xlsx | Bin 0 -> 5717 bytes examples/flatfile/requirements.txt | 5 +++ examples/flatfile/schema.json | 8 ++++ 11 files changed, 116 insertions(+) create mode 100644 examples/flatfile/BUILD create mode 100644 examples/flatfile/README.md create mode 100644 examples/flatfile/main.py create mode 100644 examples/flatfile/output.csv create mode 100644 examples/flatfile/output.json create mode 100644 examples/flatfile/output.jsonl create mode 100644 examples/flatfile/output.parquet create mode 100644 examples/flatfile/output.xlsx create mode 100644 examples/flatfile/requirements.txt create mode 100644 examples/flatfile/schema.json diff --git a/.gitignore b/.gitignore index b4d5b59..2510a5c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Application *.json +!examples/**/*.json !.vscode/settings.json !jsf/tests/data/*.json TODO.md diff --git a/examples/flatfile/BUILD b/examples/flatfile/BUILD new file mode 100644 index 0000000..a5191a7 --- /dev/null +++ b/examples/flatfile/BUILD @@ -0,0 +1,5 @@ +python_sources(interpreter_constraints=["CPython>=3.10,<4"]) + +python_requirements( + name="reqs", +) diff --git a/examples/flatfile/README.md b/examples/flatfile/README.md new file mode 100644 index 0000000..9660f1d --- /dev/null +++ b/examples/flatfile/README.md @@ -0,0 +1,17 @@ +# Flat file example + +## Dependencies + +* Typer +* pandas +* openpyxl +* PyArrow +* jsonlines + +## Usage + +Using `main.py` in current example folder + +```bash +python main.py --schema schema.json --records 5 --output-format parquet --output output.parquet +``` diff --git a/examples/flatfile/main.py b/examples/flatfile/main.py new file mode 100644 index 0000000..7c9e056 --- /dev/null +++ b/examples/flatfile/main.py @@ -0,0 +1,68 @@ +import json +from enum import Enum +from pathlib import Path + +import jsonlines +import pandas as pd +import typer +from typing_extensions import Annotated + +from jsf import JSF + + +class OutputFormat(str, Enum): + csv = "csv" + excel = "excel" + parquet = "parquet" + json = "json" + jsonl = "jsonl" + + +def main( + schema: Annotated[ + Path, + typer.Option( + exists=True, + file_okay=True, + dir_okay=False, + writable=False, + readable=True, + resolve_path=True, + help="Path to the JSON schema used to produce the fake data.", + ), + ], + records: Annotated[int, typer.Option(min=0, help="Number of records you wish to produce.")], + output_format: Annotated[OutputFormat, typer.Option(help="Fake data output format.")], + output: Annotated[Path, typer.Option(help="Output file path")], +): + faker = JSF.from_json(schema) + fake_data = faker.generate(records) + match output_format: + case OutputFormat.csv: + pd.DataFrame.from_records(fake_data).to_csv(output, index=False) + case OutputFormat.excel: + more_fake_data = faker.generate(records) + custom_header = [ + v.get("title") or k for k, v in faker.root_schema["properties"].items() + ] + with pd.ExcelWriter(output) as excel_writer: + pd.DataFrame.from_records(fake_data).to_excel( + excel_writer, sheet_name="Fake Data", index=False, header=custom_header + ) + pd.DataFrame.from_records(more_fake_data).to_excel( + excel_writer, sheet_name="More Fake Data", index=False, header=custom_header + ) + case OutputFormat.json: + with open(output, "w") as f: + json.dump(fake_data, f) + case OutputFormat.jsonl: + with jsonlines.open(output, mode="w") as writer: + writer.write_all(fake_data) + case OutputFormat.parquet: + pd.DataFrame.from_records(fake_data).to_parquet(output, index=False) + case _: + raise NotImplementedError("Unable to produce in this file format yet") + + +if __name__ == "__main__": + typer.run(main) diff --git a/examples/flatfile/output.csv b/examples/flatfile/output.csv new file mode 100644 index 0000000..730f90c --- /dev/null +++ b/examples/flatfile/output.csv @@ -0,0 +1,6 @@ +name,email +Jessica Bennett,eric90@example.org +Christine Sanchez,epeterson@example.net +Emily Ayala,wreed@example.org +Julia Dickerson,dwaynehoward@example.com +Justin Miller,emiller@example.com diff --git a/examples/flatfile/output.json b/examples/flatfile/output.json new file mode 100644 index 0000000..4632c55 --- /dev/null +++ b/examples/flatfile/output.json @@ -0,0 +1 @@ +[{"name": "Melissa Jackson", "email": "reneebullock@example.net"}, {"name": "Miss Tina Morales", "email": "alexandergeorge@example.net"}, {"name": "Jake Vazquez", "email": "jeffreyreeves@example.com"}, {"name": "Terri Taylor", "email": "nperez@example.net"}, {"name": "Kayla Williams", "email": "davidprice@example.org"}] \ No newline at end of file diff --git a/examples/flatfile/output.jsonl b/examples/flatfile/output.jsonl new file mode 100644 index 0000000..2adc0e9 --- /dev/null +++ b/examples/flatfile/output.jsonl @@ -0,0 +1,5 @@ +{"name": "Judith Lara", "email": "rhowe@example.com"} +{"name": "Joseph Warren", "email": "carrollandrew@example.net"} +{"name": "Marilyn Thompson", "email": "tyler22@example.net"} +{"name": "Teresa Brown", "email": "tharris@example.net"} +{"name": "Derek Singleton", "email": "lindsay66@example.org"} diff --git a/examples/flatfile/output.parquet b/examples/flatfile/output.parquet new file mode 100644 index 0000000000000000000000000000000000000000..86d248885503d1f227dd94a6494440d7a462eb35 GIT binary patch literal 2034 zcmcgtO>Y}T7#=%`Tau^*${M=_w6a>$gKKOj8%V{WvrcTUNz%q4yS5REW05;d$-=WTOv%{BP_JZ zzS>a9zIF^KrkXc?s_ZKd?bGLx-5*{Qk z+)u+ipPHF@-ay4))62qQUP76rd~TKz#}d-n`RvbW_$&UFD&G7@z>hT+=UBWp|?k6LBdAKU@6G4$;qv>9P8B~19>5wpT9*mdIT zmLC$Ilp*Zh)F*d|Pu(DJ>~L6q4?TMyVwi0VAuAYKKIEI$={Y!nT+^A`*VK^9UNBuw z2l%U<`r*b!r{*k&*feg{FEiN}GsWkb;$JhoSFc5JUGMUrN+_2Jj3H16vq)IHmrf4m z9@<)lqGChBKec4%^UL|te{qt<|7FkF?=so>;?3-zbMGLOFXb|zc#{OV-_vIW8%}k6 zV7N4LSWtK5esKHMRk{sis`+hyEZmCBWJr2u6XrtFuTwQ`O7P;L=-I*+x7iRN<8G9U4AI;dle zcj2Fp&oExa@&Gq59^m>Qw#o7p|A-qm&;9KYxcM4Jj1v15j7Jz8@Z=zJh5L9QZtlZF z{#bEn)NiV*-P+EH-cm!IZQGA+S?}(R28uJa(RELEI@D^ar^=B#(5jU{ske{x+PJD~ zQes;PY~5*u{XJz3d{IxUQpM&NeXp;LkGnN#sB8OxjbFxbj8RXMj(gpmL$gIk)7+}= zP{8g?^QEfZm8I@VM>3kz{95~IU)!l9_*H5)8-URn^4fH!26_wXYh9^_>RQ9o>k8*| zh^7Yp#(GjCz{(9qMOlXte0%nooLg z$M#ls6ThZbdRk{_HaB@MBU2j(hicL@!%@M$KH<35q#Y2gJ?VAZQoodwh!Z!Cx@fAI5#HIwTs`(}u$1L1g<}PoU2h*B6Z4i=@p) zEJFFEKDnC@&L_t`kC#bG9Ex(HNBpOe6+SRL??L=wiPH~9Og^c63GQNk9se|k&=dHz F{~I7FL+1bh literal 0 HcmV?d00001 diff --git a/examples/flatfile/output.xlsx b/examples/flatfile/output.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..666ca20a7fc6dc39493f2591756f6e7217fc13ab GIT binary patch literal 5717 zcmZ`-2T)Vp)(z4_@6u5MQl<9}5_*#Wp$LdXTBxBH1q>inf;6eprK9vB9VCKusZyi} zC`gkk!izfp^OOI3H*-%WGk2}*yJwxf_gVT{*f`Vx0Du6%6D$V>n{^O}Vt&2E+~kUX6pn0m`B{=od{jZPyK|Vtj*bO`|O` zn105*Su4H9)FQUTy-`{pmYU(2;wZ#*oZ_?}7dF|#TIYIl7q{){DX#us@s@V`6o1CV z*^dPPQ2aCAd#)a^p8*%3I&`}PiNmd6Z(n=OU{M;R^mFR6T@xbg=GScDHygVZY0zAM zgWnuKl=02+#o4qm12giFB0G=lD%!@3xMC||_-+sBEhPXbNU`Ylv&fqfGOU@oXE6Z; znyl|xaz34rG9?H{QO9&%9{_tBGsy9FsljiAj0E#bZ*nOa8N0y<^Rym{YooPE(SwkM zBUoETLyG~ftKJiVvAwV@%3@vir?Df-YLJyYCIMhkj`O1VOR1rAp68}3grT}vCYuQ@ zHz(7#u%C)mx&kd}4t^vr04*ca+MZOf{xHee@BO-|5JMLNUz}`1>;0XWXBHXkRLxPYfU%o+B375bT)Ei-e(j3o7g?aE9_y&rpuG`6iROBN z`9%|95QUN;eNp93JFr08vewK2&?Om!rlRcn5E@OO1e@lmO3nw&`vRAUBtOb%vP%0PTVjKII+7l20&jaokBr9f%0Qm!tD zmWc87O!y|X@0MrCGE-5$OC-Wm3QqxXQWi@+&{3y4=Pj5y&lpqONfZE`YNDPvRI?X( za=u#^Vdwx?GI~YWkIMR=uloA}C6aqv^eaLGur5YsBsxovL))bu-V7Ql*lzqLmor)~!5(DSF&1q;4|Xx{ z{$3$8GbwtL>z;|nyS_J&%BS2@QrYdBKkVhRcDGKaLCDH9Q+{^Zc&(6XB~Zgz^xGYt zZo+k2a}L6O$QVo_iZV-?(PhK%Kr?A_wX%_`LDqa_qcX=+o^R+x)@rTh9g&Diy@%H4 zy2RF6PD#ygF9+OoS^9}(>|_&|>wKOwi+w-5O?k0<2Q) zl6v(;sh#0vAU<%tb;IN?AN8WGHUD$`N0kOw_HG+l3z_#~^Ty`}YR9OqyRD^ISS^nv zEYB9gYF#S1edwL9-e?B8Jrr1; zs#n;PnDA)ngje*xCdA`WVZBR4-`cqnYFo$@oO(+1dXKn~n`W?FUh1JL-=lC{%TW$W z>ZmGW+-r0m%iG&e9erXZPx(Tekq}|q_jGSnB0JOOSWX46U5j zT3!Awnfx~{W-Z6%YUt#^`+3h6S*jxt>HZC!s%00@36>M#1u3^#>3M9+PgHy=`42jR zqN0K+N^M%3D)mk{GGuB3c0<_G&n9N`59bz%1IJ6v3T$~P6twUISrhQ?#h8B+wR%Er zQ(pM8ymv;3*e@8mPn>gKav?%BjX~snbeU8v&{gTZ;;O>zIEnosi6|2a*HY+Nop|1g zE>QNv`8G*aXLsDkt>`wazZ=NZiwA{YFkO`p7XY~WyMg$)dN?8MVK7gGz^|`gT_iSf z&SOT9Jj_aU8D9|za2DwuQ%*mxrx$jvpbO|oQ#Ug)`@D0A!g0697ioNDJSz3Qe8p%! zq4`McdNh7sc{WT8f#4_*a1vd_AAe55>*RqaiF9EPS|%?637f@*cNL31ha0areC?%? zayI3Sme4&e7h3d2jE@h#ir#dvDHaziA~angf$z|sm{8$dOU%?OAU3P)14eu?J^Z@l zD6bP$vs~Xf!5XW|cVE?$_0iCKK1ML-IMkyic;_UVXk1zvH$ee^Y*Wes!tx}0v(oSf zmMbm`UH&$t=YTIDGZHvi^}Z$lkc!1IO_seCKm5ASIFK%ye@~e7_8}y2ssb)sV(xv{ zC@j#z!nKxGq22nXRC~9EDNFwT{wUc;&TnnP^sjsf@<{KFFYn-Lnnk)U%1#@eeD_S! zO{hNx6pl!8p))=o8iPa%uc$HTrrD_@l{Qrnq(jRZ$| zBhlIKFTeqb^31#{7TdkcaiA8SBJ~vmqMjQ|G7Yh>EbT4ump?8VaZPLmxUdR}6^j{n zYjcu(*@5IlGj^V)J#J<@3_LoedGfbFTtax}_b>t>``-c){yApECc3&}1k&3Ul|&ZI zP2FUqQPVl&oF;y4cw&;Xk(}%Tz1ctdb?hkPc8Vr)%P6oed+ov6afbKARuD<>rV{UR zbv_tgA$6omVs?yjZXec!WzwqW%(^c~lEVZ>qx5>0r6!frKziRBlll#3a})54<4%m7 zKC;j~ho^ARq5HedzB+E+@XZpp!Dnn|B_ZTm87WF+h)J0c#5YR*U|}Kf>2>bHuA%bU zlU_dR>RNBsPytwS3^o-Pmmrkj^elvfizego%29`?pAJLfn`OH1&8-hQXMto|rC(u@ zUdb(5P{Jg8nhNJ5dQ!1>mJ|v=YQN9$Hx^vtdeMUG)ki&J-VU7x&ch=m`GKhmwVE%t zl~vxNtUEyhq@z848~i^A4~%=c18$|>LZXAaVk>E6Uzr!=tIDbh=I1XoQIWbyUjWFI zC4gYO9pVAePHlX?NpB+3=~3^_3}yOJ2aQP+Zr_JA2Efvx=N90^eysk=+ts!FnICF* zj;Qn7^YRnCAp0)mBoa@6F@onU?dJM<~_>J%VqXo6V6q z`B$}vk2L%h@aw^GoFo07bO2d z(~X~~qi^s+IbVEKGik=LU@oFH%{r9B>V;6LZCk5I9aX|6>D=MV=g3{A>_!<|xrV%z zC(LU%rMQYCc(yXWD8QzxzBJf$;g2Cm)Sb#~NRz#@NnWp((Z}c8*0nVA3ilv&C7k3+ z0#?Iq?Z8_l-)`!>de;t~%BfAjKz~suwu}CsA&Z$8P{K$i&Ir5S!|+&k!Fz2S9NWCR=-R&pbD; zwZ4YE>DjyI-?2mHX2>|}7O=|oLfi>$9zo1|m`Rthv1`D%7NcBD0e2N8LS@*ScrNtu zcrlCGyz(kl?W%8DUo{T$rovUf`=%hU+gV{2!9vuxJj|K1Z^gLa$+xM2T*Dsj<3Xh+QU2u3uW5jPM2q3%9WiJjyzFE&CwZrjmwix@~s( z{yW{4d=cZ8V2KPgGnZt{rbxZ+rara~5Lde9rRLg|6OTLZ&&zan6pc2>WMAxEAMvLi zC1f_qv}dTWst0V!7@u*R9&^C6 zQd*@x*K2C&K~sRiC_78%N2ESgFmC$ zNEyvLV2eafd7Fk6PAM{4y^(Lfg{yDRrQS8-C5ID~`uG}g0>$-+Z(;N2b$cS%f+FG% zQ%PR+SKm`9loiRfeJhfr_V~p$kGBa|0i$#teTDHTS@*Kh1n7J6*hw6O zALA*<%&8$cz?R4ds0gm1{sc@woqY((PHY)$)Lm;fP~zH0RsVhYe% zx?b%|R69n=v=~)mL=CX=fWZ+0H-D~~30f{!goqU@DPdOH#obqg%BHGR9uqZ0uH`-J zM&0ZFb{IR=&Sp=cb&-|DUr8qV+?xq8TLAAB?6#OT7Apt5a^kkma?geGh-oSE=ZMA+ zA54++ecI>f7uGNKhhI?3owQ{N7DSFt zsuFU#R-IfrU0*$59=wGq@d(U|lKtMgKW6|NS68QBl^r$r^K2bv#qkzvzgY?TMdb+v8OdWkSo_JZnHL94Z0`aSW;2)B;N*`*DqR#&(}wT++o<@ zMk2?Ka7!I{A{oavRf!YdT~(M}`zc6k%&+r#lG>|^jtO+gA#1V|J!F*u;QB3C7+Bt7 z8aHSO1&u|?Chiu#0K+2OJQlEDQmc7FpMx(@{paS7sPXt2WL1*-A;8JujjXFfwy#HL z6Ewb942JiTPTf0EnFpu4mbkrMGTLPjS@X!kvyR;iR2bPe67)j~%0e=W#2m?5ZnJz% zZKB<3eS^G;2C210?NS@l_pO>SD>wEq%XPl68cksvv^lF|d9=Dd;nqSWOBtpK^_Sw}Z=Ao24SZJj!xfq7VU%^jy;*oFSeK>N7h4bPYXB80Fw9_R71QZ2PA3_z`IOA*t$ut9m$sj zTS68@Y?C*i<)$*7E7y3-O@s==VYecWw8}|NX879~q&n7;#zS z{B|hJRa{*>VJ@Cl5I;8<;?7Tj^+5VD0wX_5>YL>8-Uz{V^v|)#!GF*WO7D%b+H=u5 zJ%Ur35kkyL*xVy&wl@cS-|X!jkYiO3nm!4$f4ubQHdK63^euS@qJR|bu(VKPWROxm zA8IzRc2q)~)o!RO7Xjs;)c=xMuvxt9EZRYLJwz2H7uRlCN-hm{($ujq39t$>L@UsM zd`PdtwY%xDZxs$|5*M)2#Kj5(sG}kA&Yrn;TUQIEOXEpt^({Y1E(?#>a_0b72i>E) zQx#370P)oxpv*fcP8Q*qg=RD!URrb%@mxMR%23O#d0r^FRpyB{PUJw6N*3r@z#d_a zUQ6ib`X$+qNAWY)s32X!K?7C@`P~Ia+udyNyt4hpY3EU?=gjd_EHPQWQZjultSi*m z|J%I72e3YR@Fbz|7WaShF>} zLVyIn2>$i>ahc`vRQiX77x=F%zh>0SESG1N|FU2i68~cPGr?SjUTzM5pd6SI`j1v| z8F;za{sEF=oLEc(|5*brgD)43Kj4QLTNDHSA7$h+&*e?|hsTK$0Qi?Js;`BMaTx&s PAm)R_RITQpW+UJ~opRx5 literal 0 HcmV?d00001 diff --git a/examples/flatfile/requirements.txt b/examples/flatfile/requirements.txt new file mode 100644 index 0000000..a3c1d84 --- /dev/null +++ b/examples/flatfile/requirements.txt @@ -0,0 +1,5 @@ +jsf +pandas +openpyxl +PyArrow +jsonlines \ No newline at end of file diff --git a/examples/flatfile/schema.json b/examples/flatfile/schema.json new file mode 100644 index 0000000..657d69c --- /dev/null +++ b/examples/flatfile/schema.json @@ -0,0 +1,8 @@ +{ + "type": "object", + "properties": { + "name": {"type": "string", "$provider": "faker.name", "title": "Full Name"}, + "email": {"type": "string", "$provider": "faker.email", "title": "Email"} + }, + "required": ["name", "email"] +} \ No newline at end of file