diff --git a/.secrets.baseline b/.secrets.baseline index d1cdb1e7..049382d8 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -134,7 +134,7 @@ "filename": "README.rst", "hashed_secret": "077d5a0e0f8bb517307a6e92a73b0a9aa959233c", "is_verified": true, - "line_number": 578 + "line_number": 582 } ], "docs/_static/examples/recipes/sftp_storage_1.py": [ @@ -219,5 +219,5 @@ } ] }, - "generated_at": "2023-11-18T23:53:12Z" + "generated_at": "2023-11-19T23:07:25Z" } diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ea3408d9..ed1d3b8e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -28,6 +28,12 @@ are used for versioning (schema follows below): 0.3.4 to 0.4). - All backwards incompatible changes are mentioned in this document. +0.17.11 +------- +2023-11-20 + +- Minor documentation fixes. + 0.17.10 ------- 2023-11-19 diff --git a/Makefile b/Makefile index ba84a5b3..710bdc5a 100644 --- a/Makefile +++ b/Makefile @@ -132,10 +132,14 @@ make_migrations: echo 'Applying migrations...' ./manage.py migrate -make_release: +release: python setup.py sdist bdist_wheel twine upload dist/* --verbose +test_release: + python setup.py sdist bdist_wheel + twine upload --repository testpypi dist/* --verbose + migrate: cd examples/django_example/ && ./manage.py migrate "$$@" diff --git a/README.rst b/README.rst index e253ee3d..fcd5bc22 100644 --- a/README.rst +++ b/README.rst @@ -71,6 +71,7 @@ faker-file .. _gTTS: https://gtts.readthedocs.io/ .. _google-cloud-storage: https://pypi.org/project/google-cloud-storage/ .. _imgkit: https://pypi.org/project/imgkit/ +.. _nltk: https://www.nltk.org/ .. _nlpaug: https://nlpaug.readthedocs.io/ .. _numpy: https://numpy.org/ .. _odfpy: https://pypi.org/project/odfpy/ @@ -85,6 +86,7 @@ faker-file .. _python-pptx: https://python-pptx.readthedocs.io/ .. _reportlab: https://pypi.org/project/reportlab/ .. _tablib: https://tablib.readthedocs.io/ +.. _textaugment: https://pypi.org/project/textaugment/ .. _tika: https://pypi.org/project/tika/ .. 
_transformers: https://pypi.org/project/transformers/ .. _wkhtmltopdf: https://wkhtmltopdf.org/ @@ -117,8 +119,8 @@ All licenses are mentioned below between the brackets. requires either just `Pillow`_ (`HPND`), or a combination of `imgkit`_ (`MIT`) and `wkhtmltopdf`_ (`LGPLv3`). - ``MP3`` file support requires `gTTS`_ (`MIT`) or `edge-tts`_ (`GPLv3`). -- ``PDF`` file support requires either combination of `pdfkit`_ (`MIT`) - and `wkhtmltopdf`_ (`LGPLv3`), or `reportlab`_ (`BSD`). +- ``PDF`` file support requires either `Pillow`_ (`HPND`), or a combination of + `pdfkit`_ (`MIT`) and `wkhtmltopdf`_ (`LGPLv3`), or `reportlab`_ (`BSD`). - ``PPTX`` file support requires `python-pptx`_ (`MIT`). - ``ODP`` and ``ODT`` file support requires `odfpy`_ (`Apache 2`). - ``ODS`` file support requires `tablib`_ (`MIT`) and `odfpy`_ (`Apache 2`). @@ -131,9 +133,11 @@ All licenses are mentioned below between the brackets. - ``GoogleCloudStorage`` storage support requires `pathy`_ (`Apache 2`) and `google-cloud-storage`_ (`Apache 2`). - ``SFTPStorage`` storage support requires `paramiko`_ (`LGLPv2.1`). -- ``AugmentFileFromDirProvider`` provider requires `nlpaug`_ (`MIT`), - `PyTorch`_ (`BSD`), `transformers`_ (`Apache 2`), `numpy`_ (`BSD`), - `pandas`_ (`BSD`), `tika`_ (`Apache 2`) and `Apache Tika`_ (`Apache 2`). +- ``AugmentFileFromDirProvider`` provider requires either a combination of + `textaugment`_ (`MIT`) and `nltk`_ (`Apache 2`) or a combination of + `nlpaug`_ (`MIT`), `PyTorch`_ (`BSD`), `transformers`_ (`Apache 2`), + `numpy`_ (`BSD`), `pandas`_ (`BSD`), `tika`_ (`Apache 2`) and + `Apache Tika`_ (`Apache 2`). Documentation ============= @@ -360,8 +364,8 @@ functions): .. container:: jsphinx-toggle-emphasis .. code-block:: python - :emphasize-lines: 7 :name: test_usage_examples_with_faker_raw_recommended_way + :emphasize-lines: 7 from faker import Faker from faker_file.providers.txt_file import TxtFileProvider @@ -396,8 +400,8 @@ If you just need ``bytes`` back: .. 
container:: jsphinx-toggle-emphasis .. code-block:: python - :emphasize-lines: 6 :name: test_rst_readme_usage_examples_with_faker_raw_but_this_works_too + :emphasize-lines: 6 from faker import Faker from faker_file.providers.txt_file import TxtFileProvider diff --git a/docs/_static/examples/prismjs/sample.js b/docs/_static/examples/prismjs/sample.js deleted file mode 100644 index c25eda96..00000000 --- a/docs/_static/examples/prismjs/sample.js +++ /dev/null @@ -1,5 +0,0 @@ -function foo(bar) { - var a = 42, - b = 'Prism'; - return a + bar(b); -} diff --git a/docs/_static/examples/prismjs/sample.py b/docs/_static/examples/prismjs/sample.py deleted file mode 100644 index 299cadc9..00000000 --- a/docs/_static/examples/prismjs/sample.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import Callable - - -def foo(bar: Callable[[str], int]) -> int: - a = 42 - b = "Prism" - return a + bar(b) diff --git a/docs/_static/examples/recipes/augment_file_from_dir_4.py b/docs/_static/examples/recipes/augment_file_from_dir_4.py new file mode 100644 index 00000000..01ee313b --- /dev/null +++ b/docs/_static/examples/recipes/augment_file_from_dir_4.py @@ -0,0 +1,36 @@ +from faker import Faker +from faker_file.providers.augment_file_from_dir import ( + AugmentFileFromDirProvider, +) +from faker_file.providers.augment_file_from_dir.augmenters import ( + textaugment_augmenter, +) +from faker_file.providers.docx_file import DocxFileProvider +from faker_file.providers.eml_file import EmlFileProvider +from faker_file.providers.odt_file import OdtFileProvider +from faker_file.providers.txt_file import TxtFileProvider + +FAKER = Faker() +FAKER.add_provider(DocxFileProvider) +FAKER.add_provider(TxtFileProvider) +FAKER.add_provider(EmlFileProvider) +FAKER.add_provider(OdtFileProvider) +FAKER.add_provider(AugmentFileFromDirProvider) + +# Create files to test `augment_file_from_dir` with +FAKER.docx_file() +FAKER.eml_file() +FAKER.odt_file() +FAKER.txt_file() + +# We assume that directory "/tmp/tmp/" 
exists and contains +# files of `DOCX`, `EML`, `EPUB`, `ODT`, `PDF`, `RTF` or `TXT` +# formats. Valid values for `action` are: "random_deletion", +# "random_insertion", "random_swap" and "synonym_replacement" (default). +augmented_file = FAKER.augment_file_from_dir( + source_dir_path="/tmp/tmp/", + text_augmenter_cls=textaugment_augmenter.EDATextaugmentAugmenter, + text_augmenter_kwargs={ + "action": "synonym_replacement", + }, +) diff --git a/docs/recipes.rst b/docs/recipes.rst index 824f0567..4f04b554 100644 --- a/docs/recipes.rst +++ b/docs/recipes.rst @@ -1,5 +1,11 @@ Recipes ======= +.. External references + +.. _nlpaug: https://nlpaug.readthedocs.io/ +.. _nltk: https://www.nltk.org/ +.. _textaugment: https://pypi.org/project/textaugment/ + When using with ``Faker`` ------------------------- When using with ``Faker``, there are two ways of using the providers. @@ -613,7 +619,14 @@ however narrow that list by providing ``extensions`` argument: :download:`here <_static/examples/recipes/augment_file_from_dir_2.py>` ---- - +Actual augmentation of texts is delegated to an abstraction layer of text +augmenters. Currently, two augmenters are implemented. The default one, based +on `textaugment`_ (which is in turn based on `nltk`_), is very lightweight +and speedy, but produces less accurate results. Another one is based on +`nlpaug`_, which is way more sophisticated, but at the cost of speed. + +nlpaug augmenter +~~~~~~~~~~~~~~~~ By default ``bert-base-multilingual-cased`` model is used, which is pretrained on the top 104 languages with the largest Wikipedia using a masked language modeling (MLM) objective. If you want to use a different @@ -640,6 +653,18 @@ Refer to ``nlpaug`` `docs `__ and check `Textual augmenters` examples. +textaugment augmenter +~~~~~~~~~~~~~~~~~~~~~ +.. container:: jsphinx-download + + .. 
literalinclude:: _static/examples/recipes/augment_file_from_dir_4.py + :language: python + :lines: 5-7, 25- + + *See the full example* + :download:`here <_static/examples/recipes/augment_file_from_dir_4.py>` + + Using `raw=True` features in tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you pass ``raw=True`` argument to any provider or inner function, diff --git a/setup.py b/setup.py index 1dabd2d0..5d5dc58e 100644 --- a/setup.py +++ b/setup.py @@ -4,19 +4,19 @@ from setuptools import find_packages, setup -def clean_readme(text): +def clean_readme(text: str) -> str: # Pattern to match ":emphasize-lines:" followed by digits emphasize_lines_pattern = r":emphasize-lines: \d+" text = re.sub(emphasize_lines_pattern, "", text) - # Pattern to match ":name:" followed by any characters to the line end - name_lines_pattern = r":name: .*$" - text = re.sub(name_lines_pattern, "", text, flags=re.MULTILINE) + # # Pattern to match ":name:" followed by any characters to the line end + # name_lines_pattern = r":name: .*$" + # text = re.sub(name_lines_pattern, "", text, flags=re.MULTILINE) return text -version = "0.17.10" +version = "0.17.11" try: readme = open(os.path.join(os.path.dirname(__file__), "README.rst")).read() diff --git a/src/faker_file/__init__.py b/src/faker_file/__init__.py index 67c9af2e..cf743535 100644 --- a/src/faker_file/__init__.py +++ b/src/faker_file/__init__.py @@ -1,5 +1,5 @@ __title__ = "faker_file" -__version__ = "0.17.10" +__version__ = "0.17.11" __author__ = "Artur Barseghyan " __copyright__ = "2022-2023 Artur Barseghyan" __license__ = "MIT"