Skip to content

Commit

Permalink
Merge pull request #5 from INGEOTEC/develop
Browse files Browse the repository at this point in the history
Version - 0.0.1
  • Loading branch information
mgraffg authored Jun 5, 2024
2 parents 23e4cff + 64e7e10 commit 3f5518a
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 9 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pip.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ jobs:
shell: bash -l {0}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, windows-latest]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: test
auto-update-conda: true
Expand All @@ -32,7 +32,7 @@ jobs:
run: |
conda install --yes pip
pip install twine build
conda install --yes numpy scipy scikit-learn nose
conda install --yes numpy scipy scikit-learn nose evomsa
python -m build
- name: Pip
if: ${{ runner.os == 'Linux' }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
shell: bash -l {0}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, windows-latest]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
Expand All @@ -32,7 +32,7 @@ jobs:
conda install --yes pip
pip install coverage
pip install coveralls
conda install --yes numpy scipy scikit-learn nose
conda install --yes numpy scipy scikit-learn nose evomsa
python setup.py build_ext --inplace
- name: Tests on Linux
if: ${{ runner.os == 'Linux' }}
Expand Down
4 changes: 3 additions & 1 deletion dialectid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__version__ = '0.0.1'
__version__ = '0.0.1'

from dialectid.text_repr import BoW
34 changes: 34 additions & 0 deletions dialectid/tests/test_text_repr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# MIT License

# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/


from dialectid.text_repr import BoW


def test_bow():
    """Check that BoW exposes a b4msa TextModel and can transform text."""
    from b4msa.textmodel import TextModel

    model = BoW(lang='es')
    # Accessing the property builds the underlying text model.
    text_model = model.bow
    assert isinstance(text_model, TextModel)
    # Smoke check only: the call must complete without raising.
    model.transform(['Buenos dias'])
7 changes: 7 additions & 0 deletions dialectid/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,10 @@ def test_countries():
for i in v:
assert len(i) == 2

def test_load_bow():
    """Check that utils.load_bow returns a microtc Counter."""
    from microtc.utils import Counter

    # Called with its default arguments.
    freq = utils.load_bow()
    assert isinstance(freq, Counter)
69 changes: 69 additions & 0 deletions dialectid/text_repr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# MIT License

# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from sklearn.svm import LinearSVC
from EvoMSA import BoW as EvoMSABoW
from EvoMSA.utils import b4msa_params
from b4msa.textmodel import TextModel
from microtc.weighting import TFIDF
from dialectid.utils import load_bow


class BoW(EvoMSABoW):
    """Bag-of-words representation built on EvoMSA's BoW.

    The text model is created from the token-frequency counter returned by
    :func:`dialectid.utils.load_bow`.

    >>> from dialectid import BoW
    >>> bow = BoW(lang='es')
    >>> X = bow.transform(['Buenos dias', 'Disfruta dialectid'])
    """

    def __init__(self, pretrain: bool=True,
                 v1: bool=False,
                 estimator_class=LinearSVC,
                 estimator_kwargs=dict(dual=True,
                                       class_weight='balanced'),
                 **kwargs):
        """Create the representation.

        :param pretrain: Must be true; any falsy value is rejected.
        :param v1: Must be false; any truthy value is rejected.
        :param estimator_class: Forwarded to the EvoMSA BoW constructor.
        :param estimator_kwargs: Forwarded to the EvoMSA BoW constructor.
            The default dict is created once and shared between calls;
            it is not modified here.
        :param kwargs: Remaining keyword arguments, forwarded unchanged to
            the EvoMSA BoW constructor.
        :raises AssertionError: If ``pretrain`` is falsy or ``v1`` is truthy.
        """
        assert pretrain, 'dialectid.BoW requires pretrain=True'
        assert not v1, 'dialectid.BoW does not support v1=True'
        # The estimator arguments are named in the signature, so they are
        # not part of **kwargs and must be forwarded explicitly; otherwise
        # a caller-supplied estimator would be silently ignored.
        super(BoW, self).__init__(pretrain=pretrain,
                                  v1=v1,
                                  estimator_class=estimator_class,
                                  estimator_kwargs=estimator_kwargs,
                                  **kwargs)

    @property
    def bow(self):
        """Text model (b4msa ``TextModel``), built on first access.

        The model is stored in ``self._bow`` and reused on later accesses.
        """

        try:
            bow = self._bow
        except AttributeError:
            # First access: build the model from the stored frequencies.
            freq = load_bow(lang=self.lang,
                            d=self.voc_size_exponent,
                            func=self.voc_selection)
            # NOTE(review): this reads the private _voc_size_exponent while
            # load_bow above receives the public voc_size_exponent;
            # presumably both hold the same value - confirm.
            params = b4msa_params(lang=self.lang,
                                  dim=self._voc_size_exponent)
            # User-provided b4msa options take precedence over the defaults.
            params.update(self.b4msa_kwargs)
            bow = TextModel(**params)
            # Fill a TFIDF weighting from the counter and attach it to the
            # text model as its model.
            tfidf = TFIDF()
            tfidf.N = freq.update_calls
            tfidf.word2id, tfidf.wordWeight = tfidf.counter2weight(freq)
            bow.model = tfidf
            self._bow = bow
        return bow
34 changes: 32 additions & 2 deletions dialectid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@
# SOFTWARE.
# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/

from EvoMSA.utils import Download
from microtc.utils import Counter
from os.path import join, dirname, isdir, isfile
import gzip
import os

BASEURL = 'https://github.com/INGEOTEC/dialectid/releases/download/data'

COUNTRIES = {'es':['mx', 'cl', 'es', # Mexico (MX), Chile (CL), Spain (ES)
'ar', 'co', 'pe', # Argentina (AR), Colombia (CO), Peru (PE)
Expand Down Expand Up @@ -91,5 +98,28 @@
'zh':['cn', 'sg', 'hk', # China, Singapore, Hong Kong
'tw' # Taiwan
]
}

}


def load_bow(lang='es', d=17, func='most_common_by_type'):
    """Load the token-frequency counter used to build the BoW model.

    The counter is stored as a gzip-compressed JSON file named
    ``{lang}_bow_{func}_{d}.json.gz``. It is looked up in the ``models``
    directory next to this module; when the file is not there it is
    fetched from ``BASEURL`` first.

    :param lang: Language code; lower-cased and stripped before use.
    :param d: Value placed in the ``{d}`` slot of the file name.
    :param func: Value placed in the ``{func}`` slot of the file name.
    :return: The object produced by ``Counter.fromjson``.
    :raises Exception: If the stored file cannot be read or decoded. The
        exception message is the file path, the original error is attached
        as ``__cause__``, and the unreadable file is removed.
    """

    def load(filename):
        """Return the decompressed content of *filename* as text."""
        try:
            with gzip.open(filename, 'rb') as fpt:
                return fpt.read().decode('utf-8')
        except Exception as exc:
            # Remove the unreadable file so a later call does not find it
            # with isfile() and fail on it again.
            try:
                os.unlink(filename)
            except OSError:
                # Cleanup is best effort; the read error below is the one
                # the caller needs to see.
                pass
            raise Exception(filename) from exc

    lang = lang.lower().strip()
    diroutput = join(dirname(__file__), 'models')
    # exist_ok avoids failing when another process creates the directory
    # between a separate existence check and the creation call.
    os.makedirs(diroutput, exist_ok=True)
    filename = f'{lang}_bow_{func}_{d}.json.gz'
    url = f'{BASEURL}/{filename}'
    output = join(diroutput, filename)
    if not isfile(output):
        Download(url, output)
    return Counter.fromjson(load(output))

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
name = 'dialectid'
dependencies = [
'numpy',
'scikit-learn>=1.3.0'
'scikit-learn>=1.3.0',
'microtc',
'b4msa',
'EvoMSA'
]
dynamic = ['version']

Expand Down

0 comments on commit 3f5518a

Please sign in to comment.