diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 89ad7128a..9ab427438 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -188,8 +188,7 @@ def __init__(self, path, *args): Reader.__init__(self, self.filename, *args) def read_file(self): - path, name = os.path.split(self.filename) - self.filename = os.path.join(path, quote(name)) + self.filename = quote(self.filename, safe="/:") self.filename = self._trim(self._resolve_redirects(self.filename)) with contextlib.closing(self.urlopen(self.filename)) as response: name = self._suggest_filename( @@ -216,6 +215,10 @@ def __init__(self, startdir: str, is_url: bool = False, formats: Tuple[str] = DefaultFormats, report_progress: Callable = None): + if is_url and not startdir.endswith("/"): + startdir += "/" + elif not is_url: + startdir = os.path.join(startdir, "") self.startdir = startdir self.formats = formats self._report_progress = report_progress @@ -394,7 +397,7 @@ def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",), include_patterns = include_patterns or ("*",) paths = [] for filename in files: - path = os.path.join(topdir, os.path.join(*filename)) + path = topdir + "/".join(filename) if matches_any(path, include_patterns) and \ not matches_any(path, exclude_patterns): paths.append(path) diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 9f8a553d2..179388827 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import patch +import numpy as np import pandas as pd from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \ @@ -56,19 +57,19 @@ def test_scan_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path) - 
self.assertEqual(len(paths), 101) + self.assertGreater(len(paths), 0) def test_scan_url_txt(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.txt"]) - self.assertEqual(len(paths), 100) + self.assertGreater(len(paths), 0) def test_scan_url_csv(self): path = "http://file.biolab.si/text-semantics/data/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.csv"]) - self.assertEqual(len(paths), 6) + self.assertGreater(len(paths), 0) def test_read_meta_data_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" @@ -77,8 +78,8 @@ def test_read_meta_data_url(self): self.assertIsInstance(data1, pd.DataFrame) self.assertEqual(len(err), 0) - @patch("orangecontrib.text.import_documents.ImportDocuments." - "META_DATA_FILE_KEY", "File") + # @patch("orangecontrib.text.import_documents.ImportDocuments." + # "META_DATA_FILE_KEY", "File") def test_merge_metadata_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) @@ -89,24 +90,51 @@ def test_merge_metadata_url(self): importer._meta_data = meta_data[:50] corpus = importer._create_corpus() corpus = importer._add_metadata(corpus) - self.assertEqual(len(corpus), 4) - columns = ["name", "path", "content", "Content", "File", "Keywords"] + self.assertGreater(len(corpus), 0) + columns = ["name", "path", "content", "Content", + "Text file", "Keywords"] self.assertEqual([v.name for v in corpus.domain.metas], columns) importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' importer._meta_data = None corpus = importer._create_corpus() corpus = importer._add_metadata(corpus) - self.assertEqual(len(corpus), 4) + self.assertGreater(len(corpus), 0) columns = ["name", "path", "content"] self.assertEqual([v.name for v in corpus.domain.metas], columns) def test_run_url(self): + path = 
"http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample/" + importer = ImportDocuments(path, True) + corpus1, _ = importer.run() + self.assertGreater(len(corpus1), 0) + + mask = np.ones_like(corpus1.metas, dtype=bool) + mask[:, 1] = False + + path = "http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample////" + importer = ImportDocuments(path, True) + corpus2, _ = importer.run() + self.assertGreater(len(corpus1), 0) + self.assertEqual(corpus1.metas[mask].tolist(), + corpus2.metas[mask].tolist()) + + path = "http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample" + importer = ImportDocuments(path, True) + corpus3, _ = importer.run() + self.assertGreater(len(corpus2), 0) + self.assertEqual(corpus1.metas[mask].tolist(), + corpus3.metas[mask].tolist()) + + def test_run_url_special_characters(self): path = "http://file.biolab.si/text-semantics/data/" \ "elektrotehniski-vestnik-clanki/" importer = ImportDocuments(path, True) corpus, errors = importer.run() - self.assertEqual(len(corpus), 382) + self.assertGreater(len(corpus), 0) if __name__ == "__main__": diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 97656c543..3a7624e14 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -10,6 +10,7 @@ import warnings import logging import traceback +from urllib.parse import urlparse from types import SimpleNamespace as namespace from concurrent.futures._base import TimeoutError @@ -22,11 +23,14 @@ from AnyQt.QtWidgets import ( QAction, QPushButton, QComboBox, QApplication, QStyle, QFileDialog, QFileIconProvider, QStackedWidget, QProgressBar, QWidget, QHBoxLayout, - QVBoxLayout, QLabel + QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter ) +from orangewidget.utils.itemmodels import PyListModel + from Orange.data import Table, Domain, StringVariable from Orange.widgets import widget, gui, settings 
+from Orange.widgets.data.owfile import LineEditSelectOnFocus from Orange.widgets.utils.filedialogs import RecentPath from Orange.widgets.utils.concurrent import ( ThreadExecutor, FutureWatcher, methodinvoke @@ -91,9 +95,12 @@ class Outputs: data = Output("Corpus", Corpus) skipped_documents = Output("Skipped documents", Table) + LOCAL_FILE, URL = range(2) + source = settings.Setting(LOCAL_FILE) #: list of recent paths recent_paths: List[RecentPath] = settings.Setting([]) currentPath: Optional[str] = settings.Setting(None) + recent_urls: List[str] = settings.Setting([]) want_main_area = False resizing_enabled = False @@ -116,8 +123,18 @@ def __init__(self): self.__invalidated = False self.__pendingTask = None - vbox = gui.vBox(self.controlArea) - hbox = gui.hBox(vbox) + layout = QGridLayout() + layout.setSpacing(4) + gui.widgetBox(self.controlArea, orientation=layout, box='Source') + source_box = gui.radioButtons(None, self, "source", box=True, + callback=self.start, addToLayout=False) + rb_button = gui.appendRadioButton(source_box, "Folder:", + addToLayout=False) + layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter) + + box = gui.hBox(None, addToLayout=False, margin=0) + box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed) + self.recent_cb = QComboBox( sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon, minimumContentsLength=16, @@ -148,25 +165,50 @@ def __init__(self): browseaction.iconText(), icon=browseaction.icon(), toolTip=browseaction.toolTip(), - clicked=browseaction.trigger + clicked=browseaction.trigger, + default=False, + autoDefault=False, ) reloadbutton = QPushButton( reloadaction.iconText(), icon=reloadaction.icon(), clicked=reloadaction.trigger, - default=True, + default=False, + autoDefault=False, ) - - hbox.layout().addWidget(self.recent_cb) - hbox.layout().addWidget(browsebutton) - hbox.layout().addWidget(reloadbutton) + box.layout().addWidget(self.recent_cb) + layout.addWidget(box, 0, 1) + layout.addWidget(browsebutton, 
0, 2) + layout.addWidget(reloadbutton, 0, 3) + + rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False) + layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter) + + self.url_combo = url_combo = QComboBox() + url_model = PyListModel() + url_model.wrap(self.recent_urls) + url_combo.setLineEdit(LineEditSelectOnFocus()) + url_combo.setModel(url_model) + url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed) + url_combo.setEditable(True) + url_combo.setInsertPolicy(url_combo.InsertAtTop) + url_edit = url_combo.lineEdit() + l, t, r, b = url_edit.getTextMargins() + url_edit.setTextMargins(l + 5, t, r, b) + layout.addWidget(url_combo, 3, 1, 1, 3) + url_combo.activated.connect(self._url_set) + # with the completer we make the combo box case sensitive when + # matching the history + completer = QCompleter() + completer.setCaseSensitivity(Qt.CaseSensitive) + url_combo.setCompleter(completer) self.addActions([browseaction, reloadaction]) reloadaction.changed.connect( lambda: reloadbutton.setEnabled(reloadaction.isEnabled()) ) - box = gui.vBox(vbox, "Info") + box = gui.vBox(self.controlArea, "Info") self.infostack = QStackedWidget() self.info_area = QLabel( @@ -179,6 +221,8 @@ def __init__(self): self.cancel_button = QPushButton( "Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), + default=False, + autoDefault=False, ) self.cancel_button.clicked.connect(self.cancel) @@ -210,6 +254,17 @@ def __init__(self): QApplication.postEvent(self, QEvent(RuntimeEvent.Init)) + def _url_set(self): + url = self.url_combo.currentText() + pos = self.recent_urls.index(url) + url = url.strip() + if not urlparse(url).scheme: + url = "http://" + url + self.url_combo.setItemText(pos, url) + self.recent_urls[pos] = url + self.source = self.URL + self.start() + def __initRecentItemsModel(self): if self.currentPath is not None and \ not os.path.isdir(self.currentPath): @@ -336,7 +391,8 @@ def setCurrentPath(self, path): """ if self.currentPath is not None and 
path is not None and \ os.path.isdir(self.currentPath) and os.path.isdir(path) and \ - os.path.samefile(self.currentPath, path): + os.path.samefile(self.currentPath, path) and \ + self.source == self.LOCAL_FILE: return True success = True @@ -370,7 +426,7 @@ def setCurrentPath(self, path): if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE return success def addRecentPath(self, path): @@ -447,7 +503,7 @@ def reload(self): """ if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE self.corpus = None self.start() @@ -460,7 +516,9 @@ def start(self): self.progress_widget.setValue(0) self.__invalidated = False - if self.currentPath is None: + startdir = self.currentPath if self.source == self.LOCAL_FILE \ + else self.url_combo.currentText().strip() + if not startdir: return if self.__state == State.Processing: @@ -470,14 +528,13 @@ def start(self): .format(self.__pendingTask.startdir)) self.cancel() - startdir = self.currentPath - self.__setRuntimeState(State.Processing) report_progress = methodinvoke( self, "__onReportProgress", (object,)) - task = ImportDocuments(startdir, report_progress=report_progress) + task = ImportDocuments(startdir, self.source == self.URL, + report_progress=report_progress) # collect the task state in one convenient place self.__pendingTask = taskstate = namespace( diff --git a/requirements.txt b/requirements.txt index ceb496590..50a3ee91c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy python-dateutil<3.0.0 # denpendency for botocore gensim>=0.12.3 # LDA's show topics unified in 0.12.3 setuptools-git -Orange3 >=3.25.0 +Orange3 >=3.28.0 tweepy beautifulsoup4 simhash >=1.11