diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 89ad7128a..8a6a283b0 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -189,7 +189,7 @@ def __init__(self, path, *args): def read_file(self): path, name = os.path.split(self.filename) - self.filename = os.path.join(path, quote(name)) + self.filename = f"{path}/{quote(name)}" self.filename = self._trim(self._resolve_redirects(self.filename)) with contextlib.closing(self.urlopen(self.filename)) as response: name = self._suggest_filename( diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 97656c543..214af5ba5 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -10,6 +10,7 @@ import warnings import logging import traceback +from urllib.parse import urlparse from types import SimpleNamespace as namespace from concurrent.futures._base import TimeoutError @@ -22,11 +23,14 @@ from AnyQt.QtWidgets import ( QAction, QPushButton, QComboBox, QApplication, QStyle, QFileDialog, QFileIconProvider, QStackedWidget, QProgressBar, QWidget, QHBoxLayout, - QVBoxLayout, QLabel + QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter ) +from orangewidget.utils.itemmodels import PyListModel + from Orange.data import Table, Domain, StringVariable from Orange.widgets import widget, gui, settings +from Orange.widgets.data.owfile import LineEditSelectOnFocus from Orange.widgets.utils.filedialogs import RecentPath from Orange.widgets.utils.concurrent import ( ThreadExecutor, FutureWatcher, methodinvoke @@ -91,9 +95,12 @@ class Outputs: data = Output("Corpus", Corpus) skipped_documents = Output("Skipped documents", Table) + LOCAL_FILE, URL = range(2) + source = settings.Setting(LOCAL_FILE) #: list of recent paths recent_paths: List[RecentPath] = settings.Setting([]) currentPath: Optional[str] = 
settings.Setting(None) + recent_urls: List[str] = settings.Setting([]) want_main_area = False resizing_enabled = False @@ -116,8 +123,18 @@ def __init__(self): self.__invalidated = False self.__pendingTask = None - vbox = gui.vBox(self.controlArea) - hbox = gui.hBox(vbox) + layout = QGridLayout() + layout.setSpacing(4) + gui.widgetBox(self.controlArea, orientation=layout, box='Source') + source_box = gui.radioButtons(None, self, "source", box=True, + callback=self.start, addToLayout=False) + rb_button = gui.appendRadioButton(source_box, "Folder:", + addToLayout=False) + layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter) + + box = gui.hBox(None, addToLayout=False, margin=0) + box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed) + self.recent_cb = QComboBox( sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon, minimumContentsLength=16, @@ -148,25 +165,50 @@ def __init__(self): browseaction.iconText(), icon=browseaction.icon(), toolTip=browseaction.toolTip(), - clicked=browseaction.trigger + clicked=browseaction.trigger, + default=False, + autoDefault=False, ) reloadbutton = QPushButton( reloadaction.iconText(), icon=reloadaction.icon(), clicked=reloadaction.trigger, - default=True, + default=False, + autoDefault=False, ) - - hbox.layout().addWidget(self.recent_cb) - hbox.layout().addWidget(browsebutton) - hbox.layout().addWidget(reloadbutton) + box.layout().addWidget(self.recent_cb) + layout.addWidget(box, 0, 1) + layout.addWidget(browsebutton, 0, 2) + layout.addWidget(reloadbutton, 0, 3) + + rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False) + layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter) + + self.url_combo = url_combo = QComboBox() + url_model = PyListModel() + url_model.wrap(self.recent_urls) + url_combo.setLineEdit(LineEditSelectOnFocus()) + url_combo.setModel(url_model) + url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed) + url_combo.setEditable(True) + 
url_combo.setInsertPolicy(url_combo.InsertAtTop) + url_edit = url_combo.lineEdit() + l, t, r, b = url_edit.getTextMargins() + url_edit.setTextMargins(l + 5, t, r, b) + layout.addWidget(url_combo, 3, 1, 1, 3) + url_combo.activated.connect(self._url_set) + # with the completer we make the combo box case sensitive when + # matching the history + completer = QCompleter() + completer.setCaseSensitivity(Qt.CaseSensitive) + url_combo.setCompleter(completer) self.addActions([browseaction, reloadaction]) reloadaction.changed.connect( lambda: reloadbutton.setEnabled(reloadaction.isEnabled()) ) - box = gui.vBox(vbox, "Info") + box = gui.vBox(self.controlArea, "Info") self.infostack = QStackedWidget() self.info_area = QLabel( @@ -179,6 +221,8 @@ def __init__(self): self.cancel_button = QPushButton( "Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), + default=False, + autoDefault=False, ) self.cancel_button.clicked.connect(self.cancel) @@ -210,6 +254,17 @@ def __init__(self): QApplication.postEvent(self, QEvent(RuntimeEvent.Init)) + def _url_set(self): + url = self.url_combo.currentText() + pos = self.recent_urls.index(url) + url = url.strip() + if not urlparse(url).scheme: + url = "http://" + url + self.url_combo.setItemText(pos, url) + self.recent_urls[pos] = url + self.source = self.URL + self.start() + def __initRecentItemsModel(self): if self.currentPath is not None and \ not os.path.isdir(self.currentPath): @@ -370,7 +425,7 @@ def setCurrentPath(self, path): if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE return success def addRecentPath(self, path): @@ -447,7 +502,7 @@ def reload(self): """ if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE self.corpus = None self.start() @@ -460,7 +515,9 @@ def start(self): self.progress_widget.setValue(0) self.__invalidated = False - if self.currentPath is None: + startdir = self.currentPath if self.source == self.LOCAL_FILE \ + else 
self.url_combo.currentText().strip() + if not startdir: return if self.__state == State.Processing: @@ -470,14 +527,13 @@ def start(self): .format(self.__pendingTask.startdir)) self.cancel() - startdir = self.currentPath - self.__setRuntimeState(State.Processing) report_progress = methodinvoke( self, "__onReportProgress", (object,)) - task = ImportDocuments(startdir, report_progress=report_progress) + task = ImportDocuments(startdir, self.source == self.URL, + report_progress=report_progress) # collect the task state in one convenient place self.__pendingTask = taskstate = namespace( diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py index 3600b04d7..dd7bca5c6 100644 --- a/orangecontrib/text/widgets/tests/test_owimportdocuments.py +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -92,6 +92,19 @@ def test_load_empty_folder(self): self.wait_until_finished(widget=widget) self.assertIsNone(self.get_output(widget.Outputs.data)) + @unittest.skip("Due to timeout") + def test_load_from_url(self): + url = "http://file.biolab.si/text-semantics/data/semeval/" + self.widget.recent_urls = [url] + self.widget.url_combo.setCurrentText(url) + self.widget._url_set() + self.wait_until_finished(timeout=20000) + corpus = self.get_output(self.widget.Outputs.data) + self.assertEqual(len(corpus), 100) + skipped = self.get_output(self.widget.Outputs.skipped_documents) + self.assertIsNone(skipped) + + if __name__ == "__main__": unittest.main() diff --git a/requirements.txt b/requirements.txt index 5f26f69be..b3075ba02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy python-dateutil<3.0.0 # denpendency for botocore gensim>=0.12.3 # LDA's show topics unified in 0.12.3 setuptools-git -Orange3 >=3.25.0 +Orange3 >=3.28.0 tweepy beautifulsoup4 simhash >=1.11