From f53e89a97d237e38b94b9d1f9532b36bc69cd43c Mon Sep 17 00:00:00 2001 From: Markos Volikas <115570991+mvolikas@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:00:58 +0200 Subject: [PATCH] #620 Add support for shards - SolrSpout (#1343) * #620 update spout to fetch from the corresponding shard * #620 add Solr scripts * #620 fix tests to operate in cloud mode * #620 fix code format * #620 add Solr spout test * #620 add license * #620 improve the Solr related scripts * #620 add solr archetype, update readmes * #620 minor fixes * #620 do not set the 'shard' query parameter when we have a single shard * #620 fix archetype includes, improve scripts and configuration files * #620 fix java topologies * #620 add 'injection.flux' topology * #620 bring in change from #1390 * #620 update sample flux topologies and readme * #620 minor comments and readme changes --- external/solr/README.md | 119 ++---------- external/solr/archetype/pom.xml | 72 ++++++++ .../META-INF/archetype-post-generate.groovy | 21 +++ .../META-INF/maven/archetype-metadata.xml | 84 +++++++++ .../resources/archetype-resources/README.md | 143 ++++++++++++++ .../archetype-resources/clear-collections.sh} | 18 +- .../resources/archetype-resources/configsets | 1 + .../archetype-resources/crawler-conf.yaml | 160 ++++++++++++++++ .../archetype-resources/crawler.flux | 109 +++++++++++ .../archetype-resources/injection.flux | 50 +++++ .../resources/archetype-resources/pom.xml | 154 ++++++++++++++++ .../resources/archetype-resources/seeds.txt | 1 + .../archetype-resources/setup-solr.sh | 71 +++++++ .../archetype-resources}/solr-conf.yaml | 16 +- .../main/resources/default-regex-filters.txt | 32 ++++ .../resources/default-regex-normalizers.xml | 78 ++++++++ .../src/main/resources/jsoupfilters.json | 27 +++ .../src/main/resources/parsefilters.json | 23 +++ .../src/main/resources/urlfilters.json | 60 ++++++ .../docs/conf/schema.xml | 6 +- .../docs/conf/solrconfig.xml | 2 +- .../docs/conf/stopwords.txt | 0 
.../docs/conf/synonyms.txt | 0 .../metrics/conf/schema.xml | 6 +- .../metrics/conf/solrconfig.xml | 2 +- .../status/conf/schema.xml | 6 +- .../status/conf/solrconfig.xml | 2 +- external/solr/cores/docs/core.properties | 20 -- external/solr/cores/metrics/core.properties | 20 -- external/solr/cores/solr.xml | 20 -- .../stormcrawler/solr/SeedInjector.java | 57 ------ .../stormcrawler/solr/SolrCrawlTopology.java | 70 ------- .../solr/persistence/SolrSpout.java | 39 +++- .../solr/persistence/IndexerBoltTest.java | 4 +- .../solr/persistence/SolrContainerTest.java | 32 +++- .../solr/persistence/SpoutTest.java | 174 ++++++++++++++++++ .../solr/persistence/StatusBoltTest.java | 4 +- pom.xml | 3 +- 38 files changed, 1379 insertions(+), 327 deletions(-) create mode 100644 external/solr/archetype/pom.xml create mode 100644 external/solr/archetype/src/main/resources/META-INF/archetype-post-generate.groovy create mode 100644 external/solr/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/README.md rename external/solr/{cores/status/core.properties => archetype/src/main/resources/archetype-resources/clear-collections.sh} (67%) mode change 100644 => 100755 create mode 120000 external/solr/archetype/src/main/resources/archetype-resources/configsets create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/crawler-conf.yaml create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/crawler.flux create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/injection.flux create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/pom.xml create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/seeds.txt create mode 100755 external/solr/archetype/src/main/resources/archetype-resources/setup-solr.sh rename external/solr/{ => 
archetype/src/main/resources/archetype-resources}/solr-conf.yaml (92%) create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json create mode 100644 external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json rename external/solr/{cores => configsets}/docs/conf/schema.xml (93%) rename external/solr/{cores => configsets}/docs/conf/solrconfig.xml (96%) rename external/solr/{cores => configsets}/docs/conf/stopwords.txt (100%) rename external/solr/{cores => configsets}/docs/conf/synonyms.txt (100%) rename external/solr/{cores => configsets}/metrics/conf/schema.xml (91%) rename external/solr/{cores => configsets}/metrics/conf/solrconfig.xml (97%) rename external/solr/{cores => configsets}/status/conf/schema.xml (89%) rename external/solr/{cores => configsets}/status/conf/solrconfig.xml (96%) delete mode 100644 external/solr/cores/docs/core.properties delete mode 100644 external/solr/cores/metrics/core.properties delete mode 100644 external/solr/cores/solr.xml delete mode 100644 external/solr/src/main/java/org/apache/stormcrawler/solr/SeedInjector.java delete mode 100644 external/solr/src/main/java/org/apache/stormcrawler/solr/SolrCrawlTopology.java create mode 100644 external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SpoutTest.java diff --git a/external/solr/README.md b/external/solr/README.md index 7eed8d560..2a93ba3f1 100644 --- a/external/solr/README.md +++ b/external/solr/README.md @@ -1,117 +1,30 @@ -stormcrawler-solr -================== +# stormcrawler-solr -Set of Solr resources 
for StormCrawler that allows you to create topologies that consume from a Solr collection and store metrics, status or parsed content into Solr. +Set of [Apache Solr](https://solr.apache.org/) resources for StormCrawler that allows you to create topologies that consume from a Solr collection and store metrics, status or parsed content into Solr. -## How to use +## Getting started -In your project you can use this by adding the following dependency: +The easiest way is currently to use the archetype for Solr with: -```xml - - org.apache.stormcrawler - stormcrawler-solr - ${stormcrawler.version} - -``` +`mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler -DarchetypeArtifactId=stormcrawler-solr-archetype -DarchetypeVersion=3.1.1-SNAPSHOT` -## Available resources - -* `IndexerBolt`: Implementation of `AbstractIndexerBolt` that allows to index the parsed data and metadata into a specified Solr collection. - -* `MetricsConsumer`: Class that allows to store Storm metrics in Solr. - -* `SolrSpout`: Spout that allows to get URLs from a specified Solr collection. - -* `StatusUpdaterBolt`: Implementation of `AbstractStatusUpdaterBolt` that allows to store the status of each URL along with the serialized metadata in Solr. - -* `SolrCrawlTopology`: Example implementation of a topology that use the provided classes, this is intended as an example or a guide on how to use this resources. - -* `SeedInjector`: Topology that allow to read URLs from a specified file and store the URLs in a Solr collection using the `StatusUpdaterBolt`. This can be used as a starting point to inject URLs into Solr. - -## Configuration options - -The available configuration options can be found in the [`solr-conf.yaml`](solr-conf.yaml) file. - -For configuring the connection with the Solr server, the following parameters are available: `solr.TYPE.url`, `solr.TYPE.zkhost`, `solr.TYPE.collection`. 
- -> In the previous example `TYPE` can be one of the following values: - -> * `indexer`: To reference the configuration parameters of the `IndexerBolt` class. -> * `status`: To reference the configuration parameters of the `SolrSpout` and `StatusUpdaterBolt` classes. -> * `metrics`: To reference the configuration parameters of the `MetricsConsumer` class. - -> *Note: Some of this classes provide additional parameter configurations.* - -### General parameters - -* `solr.TYPE.url`: The URL of the Solr server including the name of the collection that you want to use. - -## Additional configuration options - -#### MetricsConsumer - -In the case of the `MetricsConsumer` class a couple of additional configuration parameters are provided to use the [Document Expiration](https://lucidworks.com/blog/document-expiration/) feature available in Solr since version 4.8. +You'll be asked to enter a groupId (e.g. com.mycompany.crawler), an artefactId (e.g. stormcrawler), a version, a package name and details about the user agent to use. -* `solr.metrics.ttl`: [Date expression](https://cwiki.apache.org/confluence/display/solr/Working+with+Dates) to specify when the document should expire. -* `solr.metrics.ttl.field`: Field to be used to specify the [date expression](https://cwiki.apache.org/confluence/display/solr/Working+with+Dates) that defines when the document should expire. +This will not only create a fully formed project containing a POM with the dependency above but also a set of resources, configuration files and sample topology classes. Enter the directory you just created (should be the same as the artefactId you specified earlier) and follow the instructions on the README file. -*Note: The date expression specified in the `solr.metrics.ttl` parameter is not validated. To use this feature some changes in the Solr configuration must be done.* +You will of course need to have both Apache Storm (2.7.0) and Apache Solr (9.7.0) installed. 
-#### SolrSpout +Official references: +* [Apache Storm: Setting Up a Development Environment](https://storm.apache.org/releases/current/Setting-up-development-environment.html) +* [Apache Solr: Installation & Deployment](https://solr.apache.org/guide/solr/latest/deployment-guide/installing-solr.html) -For the `SolrSpout` class a couple of additional configuration parameters are available to guarantee some *diversity* in the URLs fetched from Solr, in the case that you want to have better coverage of your URLs. This is done using the [collapse and expand](https://cwiki.apache.org/confluence/display/solr/Collapse+and+Expand+Results) feature available in Solr. - -* `solr.status.bucket.field`: Field to be used to collapse the documents. -* `solr.status.bucket.maxsize`: Amount of documents to return for each *bucket*. - -For instance if you are crawling URLs from different domains, perhaps is of your interest to *balance* the amount of URLs to be processed from each domain, instead of crawling all the available URLs from one domain and then the other. - -For this scenario you'll want to collapse on the `host` field (that already is indexed by the `StatusUpdaterBolt`) and perhaps you just want to crawl 100 URLs per domain. For this case is enough to add this to your configuration: - -```yaml -solr.status.bucket.field: host -solr.status.bucket.maxsize: 100 -``` - -This feature can be combined with the [partition features](https://github.com/apache/incubator-stormcrawler/wiki/Configuration#fetching-and-partitioning) provided by StormCrawler to balance the crawling process and not just the URL coverage. - -### Metadata - -The metadata associated with each URL is also persisted in the Solr collection configured. By default the metadata is stored as separated fields in the collection using a prefix that can be configured using the `solr.status.metadata.prefix` option. If no value is supplied for this option the `metadata` value is used. 
Take a look at the following example record: - -```json -{ - "url": "http://test.com", - "host": "test.com", - "status": "DISCOVERED", - "metadata.url.path": "http://test.com", - "metadata.depth": "1", - "nextFetchDate": "2015-10-30T17:26:34.386Z" -} -``` - -In the previous example the `metadata.url.path` and `metadata.depth` attributes are elements taken from the `metadata` object. If the `SolrSpout` class is used to fetch URLs from Solr, the configured prefix (`metadata.` in this case) is stripped before populating the `Metadata` instance. - -## Using SolrCloud - -To use a SolrCloud cluster instead of a single Solr server, you must use the following configuration parameters **instead** of the `solr.TYPE.url`: - -* `solr.TYPE.zkhost`: URL of the Zookeeper host that holds the information regarding the SolrCloud cluster. - -* `solr.TYPE.collection`: Name of the collection that you wish to use. - -## Solr configuration - -An example collection configuration for each type of data is also provided in the [`cores`](cores) directory. The configuration is very basic but it will allow you to view all the stored data in Solr. - -The configuration is only useful as a testing resource, mainly because everything is stored as a `Solr.StrField` which is not very useful for search purposes. Numeric values and dates are also **stored as strings** using dynamic fields. +## Available resources -In the `metrics` collection an `id` field is configured to be populated with an auto-generated UUID for each document, this configuration is placed in the `solrconfig.xml` file. The `id` field will be used as the `uniqueKey`. 
+* [IndexerBolt](https://github.com/apache/incubator-stormcrawler/blob/main/external/solr/src/main/java/org/apache/stormcrawler/solr/bolt/IndexerBolt.java): Implementation of [AbstractIndexerBolt](https://github.com/apache/incubator-stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/indexing/AbstractIndexerBolt.java) that allows to index the parsed data and metadata into a specified Solr collection. -In the `parse` and `status` cores the `uniqueKey` is defined to be the `url` field. +* [MetricsConsumer](https://github.com/apache/incubator-stormcrawler/blob/main/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java): Class that allows to store Storm metrics in Solr. -Also keep in mind that depending on your needs you can use the [Schemaless Mode](https://cwiki.apache.org/confluence/display/solr/Schemaless+Mode) available in Solr. +* [SolrSpout](https://github.com/apache/incubator-stormcrawler/blob/main/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/SolrSpout.java): Spout that allows to get URLs from a specified Solr collection. -To start SOLR with the preconfigured cores for StormCrawler, you can do `bin/solr start -s stormcrawler/external/solr/cores`, then open the SOLR UI (http://localhost:8983) to check that they have been loaded correctly. Alternatively, create the cores (here `status`) by `bin/solr create -c status -d stormcrawler/external/solr/cores/status/`. +* [StatusUpdaterBolt](https://github.com/apache/incubator-stormcrawler/blob/main/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java): Implementation of [AbstractStatusUpdaterBolt](https://github.com/apache/incubator-stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java) that allows to store the status of each URL along with the serialized metadata in Solr. 
diff --git a/external/solr/archetype/pom.xml b/external/solr/archetype/pom.xml new file mode 100644 index 000000000..640adbc69 --- /dev/null +++ b/external/solr/archetype/pom.xml @@ -0,0 +1,72 @@ + + + + + + 4.0.0 + + + org.apache.stormcrawler + stormcrawler + 3.1.1-SNAPSHOT + ../../../pom.xml + + + stormcrawler-solr-archetype + + maven-archetype + + + + + + src/main/resources + true + + META-INF/maven/archetype-metadata.xml + + + + src/main/resources + false + + META-INF/maven/archetype-metadata.xml + + + + + + + org.apache.maven.archetype + archetype-packaging + 3.3.1 + + + + + + + maven-archetype-plugin + 3.3.1 + + + + + diff --git a/external/solr/archetype/src/main/resources/META-INF/archetype-post-generate.groovy b/external/solr/archetype/src/main/resources/META-INF/archetype-post-generate.groovy new file mode 100644 index 000000000..0b0d56a54 --- /dev/null +++ b/external/solr/archetype/src/main/resources/META-INF/archetype-post-generate.groovy @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +def file1 = new File(request.getOutputDirectory(), request.getArtifactId() + "/setup-solr.sh") +file1.setExecutable(true, false) + +def file2 = new File(request.getOutputDirectory(), request.getArtifactId() + "/clear-collections.sh") +file2.setExecutable(true, false) diff --git a/external/solr/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml b/external/solr/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml new file mode 100644 index 000000000..92eb5a68d --- /dev/null +++ b/external/solr/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml @@ -0,0 +1,84 @@ + + + + + + + + + ^[a-zA-Z_\-]+$ + + + + + + ^\S+@\S+\.\S+$ + + + ${project.version} + + + + + + src/main/java + + **/*.java + + + + src/main/resources + + **/*.xml + **/*.txt + **/*.yaml + **/*.json + + + + + + *.yaml + *.flux + seeds.txt + README.md + + + + + + configsets + setup-solr.sh + clear-collections.sh + + + + configsets + + **/* + + + + + diff --git a/external/solr/archetype/src/main/resources/archetype-resources/README.md b/external/solr/archetype/src/main/resources/archetype-resources/README.md new file mode 100644 index 000000000..5d193b87a --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/README.md @@ -0,0 +1,143 @@ +This has been generated by the StormCrawler Maven Archetype as a starting point for building your own crawler with [Apache Solr](https://solr.apache.org/) as a backend. +Have a look at the code and resources and modify them to your heart's content. + +You need to have Apache Storm (2.7.0) installed, as well as a running instance of Apache Solr (9.7.0). + +## Generated resources + +### Flux topologies + +- `injection.flux`: Topology that reads URLs from the _seeds.txt_ file and populates the `status` collection. +- `crawler.flux`: Basic topology that uses a `SolrSpout` as source and indexes parsed content into the `docs` collection. 
+ +### Solr configuration file `solr-conf.yaml` + +For configuring the connection with the Solr server, the following parameters are available: `solr.TYPE.url`, `solr.TYPE.zkhost`, `solr.TYPE.collection`. + +In the previous example `TYPE` can be one of the following values: + +* `indexer`: To reference the configuration parameters of the `IndexerBolt` class. +* `status`: To reference the configuration parameters of the `SolrSpout` and `StatusUpdaterBolt` classes. +* `metrics`: To reference the configuration parameters of the `MetricsConsumer` class. + +> Note: Some of these classes provide additional parameter configurations. + +#### General parameters + +* `solr.TYPE.url`: The URL of the Solr server including the name of the collection that you want to use. + +#### MetricsConsumer + +In the case of the `MetricsConsumer` class a couple of additional configuration parameters are provided to use the [Document Expiration](https://lucidworks.com/blog/document-expiration/) feature available in Solr since version 4.8. + +* `solr.metrics.ttl`: [Date expression](https://cwiki.apache.org/confluence/display/solr/Working+with+Dates) to specify when the document should expire. +* `solr.metrics.ttl.field`: Field to be used to specify the [date expression](https://cwiki.apache.org/confluence/display/solr/Working+with+Dates) that defines when the document should expire. + +> Note: The date expression specified in the `solr.metrics.ttl` parameter is not validated. To use this feature some changes in the Solr configuration must be done. + +#### SolrSpout + +For the `SolrSpout` class a couple of additional configuration parameters are available to guarantee some *diversity* in the URLs fetched from Solr, in the case that you want to have better coverage of your URLs. This is done using the [collapse and expand](https://cwiki.apache.org/confluence/display/solr/Collapse+and+Expand+Results) feature available in Solr. 
+ +* `solr.status.bucket.field`: Field to be used to collapse the documents. +* `solr.status.bucket.maxsize`: Amount of documents to return for each *bucket*. + +For instance if you are crawling URLs from different domains, perhaps is of your interest to *balance* the amount of URLs to be processed from each domain, instead of crawling all the available URLs from one domain and then the other. + +For this scenario you'll want to collapse on the `host` field (that already is indexed by the `StatusUpdaterBolt`) and perhaps you just want to crawl 100 URLs per domain. For this case is enough to add this to your configuration: + +```yaml +solr.status.bucket.field: host +solr.status.bucket.maxsize: 100 +``` + +This feature can be combined with the [partition features](https://github.com/apache/incubator-stormcrawler/wiki/Configuration#fetching-and-partitioning) provided by StormCrawler to balance the crawling process and not just the URL coverage. + +> It is recommended to use Solr in cloud mode. The following configuration options are available for distributing the `status` collection across multiple shards. +> * `solr.status.routing.fieldname`: Field to be used for routing documents to different shards. The values depend on the `partition.url.mode` (`byHost`, `byDomain`, `byIP`) +> * `solr.status.routing.shards`: Number of shards for the `status` collection + +#### Metadata + +The metadata associated with each URL is also persisted in the Solr collection configured. By default the metadata is stored as separated fields in the collection using a prefix that can be configured using the `solr.status.metadata.prefix` option. If no value is supplied for this option the `metadata` value is used. 
Take a look at the following example record: + +```json +{ + "url": "http://test.com", + "host": "test.com", + "status": "DISCOVERED", + "key": "test.com", + "metadata.url.path": "http://test.com", + "metadata.depth": "1", + "nextFetchDate": "2015-10-30T17:26:34.386Z" +} +``` + +In the previous example the `metadata.url.path` and `metadata.depth` attributes are elements taken from the `metadata` object. If the `SolrSpout` class is used to fetch URLs from Solr, the configured prefix (`metadata.` in this case) is stripped before populating the `Metadata` instance. + +#### Using SolrCloud + +To use a SolrCloud cluster instead of a single Solr server, you must use the following configuration parameters **instead** of the `solr.TYPE.url`: + +* `solr.TYPE.zkhost`: URL of the Zookeeper host that holds the information regarding the SolrCloud cluster. + +* `solr.TYPE.collection`: Name of the collection that you wish to use. + +### Collection configuration files + +An example collection configuration for each type of data is also provided in the [`configsets`](configsets) directory. The configuration is very basic but it will allow you to view all the stored data in Solr. + +The configuration is only useful as a testing resource, mainly because everything is stored as a `Solr.StrField` which is not very useful for search purposes. Numeric values and dates are also **stored as strings** using dynamic fields. + +In the `metrics` collection an `id` field is configured to be populated with an auto-generated UUID for each document, this configuration is placed in the `solrconfig.xml` file. The `id` field will be used as the `uniqueKey`. + +In the `parse` and `status` cores the `uniqueKey` is defined to be the `url` field. + +Also keep in mind that depending on your needs you can use the [Schemaless Mode](https://cwiki.apache.org/confluence/display/solr/Schemaless+Mode) available in Solr. 
+
+### Solr scripts
+
+* `setup-solr.sh`: Starts Solr in cloud mode, uploads the configsets and creates the collections.
+* `clear-collections.sh`: Deletes all the documents from the collections.
+
+## Bootstrapping Solr
+
+First start Solr with the preconfigured collections for StormCrawler:
+
+```sh
+./setup-solr.sh
+```
+
+Then open the Solr admin UI (http://localhost:8983) to check that they have been loaded correctly.
+
+## Building
+
+Generate an uberjar with:
+
+``` sh
+mvn clean package
+```
+
+## Running a topology
+
+The first step consists of creating a file _seeds.txt_ in the current directory and populating it with the URLs
+to be used as a starting point for the crawl, e.g.
+
+`echo "https://stormcrawler.apache.org/" > seeds.txt`
+
+You can run the injection topology in local mode, using the URLs in _seeds.txt_ as a starting point, with
+
+``` sh
+storm local target/${artifactId}-${version}.jar org.apache.storm.flux.Flux injection.flux --local-ttl 3600
+```
+
+Note that in local mode, Flux uses a default TTL for the topology of 20 secs. The command above runs the topology for 1 hour.
+
+To start crawling, run the following command
+
+``` sh
+storm jar target/${artifactId}-${version}.jar org.apache.storm.flux.Flux crawler.flux
+```
+
+Note that in the previous command, we ran the topology with `storm jar` to benefit from the Storm UI and logging. In that case, the topology runs continuously, as intended.
+If you don't have a Storm cluster set up and/or want to run in local mode, simply replace _jar_ with _local_ and add _--local-ttl 3600_.
diff --git a/external/solr/cores/status/core.properties b/external/solr/archetype/src/main/resources/archetype-resources/clear-collections.sh old mode 100644 new mode 100755 similarity index 67% rename from external/solr/cores/status/core.properties rename to external/solr/archetype/src/main/resources/archetype-resources/clear-collections.sh index 5e1caaca3..63f1cf046 --- a/external/solr/cores/status/core.properties +++ b/external/solr/archetype/src/main/resources/archetype-resources/clear-collections.sh @@ -12,9 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Written by CorePropertiesLocator -#Sun Jun 14 04:10:47 CDT 2015 -name=status -config=solrconfig.xml -schema=schema.xml -dataDir=data + +#!/bin/bash + +collections=("docs" "metrics" "status") + +for collection in "${collections[@]}"; do + solr_url="http://localhost:8983/solr/$collection/update?commit=true" + + echo -e "\n\e[1mDeleting all documents from collection: $collection ...\e[0m" + + curl -X POST -H 'Content-Type: application/json' --data-binary '{"delete": {"query": "*:*"}}' "$solr_url" +done diff --git a/external/solr/archetype/src/main/resources/archetype-resources/configsets b/external/solr/archetype/src/main/resources/archetype-resources/configsets new file mode 120000 index 000000000..7e0914a99 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/configsets @@ -0,0 +1 @@ +../../../../../configsets/ \ No newline at end of file diff --git a/external/solr/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/solr/archetype/src/main/resources/archetype-resources/crawler-conf.yaml new file mode 100644 index 000000000..5b2b18532 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/crawler-conf.yaml @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one or 
more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Custom configuration for StormCrawler +# This is used to override the default values from crawler-default.xml and provide additional ones +# for your custom components. +# Use this file with the parameter -conf when launching your extension of ConfigurableTopology. +# This file does not contain all the key values but only the most frequently used ones. See crawler-default.xml for an extensive list. + +config: + topology.workers: 1 + topology.message.timeout.secs: 300 + topology.max.spout.pending: 100 + topology.debug: false + + fetcher.threads.number: 50 + + # override the JVM parameters for the workers + topology.worker.childopts: "-Xmx2g -Djava.net.preferIPv4Stack=true" + + # mandatory when using Flux + topology.kryo.register: + - org.apache.stormcrawler.Metadata + - org.apache.stormcrawler.persistence.Status + + # Lists the metadata to transfer to outlinks + # Used by Fetcher and SiteMapParser for redirections, + # discovered links, passing cookies to child pages, etc. + # These are also persisted for the parent document (see below). + # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.". 
+  # metadata.transfer:
+  #  - customMetadataName
+
+  # Lists the metadata to persist to storage
+  # These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*".
+  metadata.persist:
+   - _redirTo
+   - error.cause
+   - error.source
+   - isSitemap
+   - isFeed
+
+  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
+  # The full user agent value sent as part of the HTTP requests
+  # is built from the elements below. Only the agent.name is mandatory,
+  # it is also used to parse the robots.txt directives.
+
+  # The agent name must be compliant with RFC 9309 (section 2.2.1)
+  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
+  http.agent.name: "${http-agent-name}"
+  # version of your crawler
+  http.agent.version: "${http-agent-version}"
+  # description of what it does
+  http.agent.description: "${http-agent-description}"
+  # URL webmasters can go to to learn about it
+  http.agent.url: "${http-agent-url}"
+  # Finally, an email so that they can get in touch with you
+  http.agent.email: "${http-agent-email}"
+
+  http.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol"
+  https.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol"
+
+  # The maximum number of bytes for returned HTTP response bodies.
+  # The fetched page will be trimmed to 65KB in this case
+  # Set -1 to disable the limit.
+ http.content.limit: 65536 + + sitemap.discovery: true + + # FetcherBolt queue dump => comment out to activate + # if a file exists on the worker machine with the corresponding port number + # the FetcherBolt will log the content of its internal queues to the logs + # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}" + + parsefilters.config.file: "parsefilters.json" + urlfilters.config.file: "urlfilters.json" + jsoup.filters.config.file: "jsoupfilters.json" + + # revisit a page daily (value in minutes) + # set it to -1 to never refetch a page + fetchInterval.default: 1440 + + # revisit a page with a fetch error after 2 hours (value in minutes) + # set it to -1 to never refetch a page + fetchInterval.fetch.error: 120 + + # never revisit a page with an error (or set a value in minutes) + fetchInterval.error: -1 + + # set to true if you don't need any text to be extracted by JSoup + textextractor.no.text: false + + # text extraction for JSoupParserBolt + textextractor.include.pattern: + - DIV[id="maincontent"] + - DIV[itemprop="articleBody"] + - ARTICLE + + textextractor.exclude.tags: + - STYLE + - SCRIPT + + # needed for parsing with Tika + jsoup.treat.non.html.as.error: false + + # restricts the documents types to be parsed with Tika + parser.mimetype.whitelist: + - application/.+word.* + - application/.+excel.* + - application/.+powerpoint.* + - application/.*pdf.* + + # Tika parser configuration file + parse.tika.config.file: "tika-config.xml" + + # custom fetch interval to be used when a document has the key/value in its metadata + # and has been fetched successfully (value in minutes) + # fetchInterval.FETCH_ERROR.isFeed=true: 30 + # fetchInterval.isFeed=true: 10 + + # configuration for the classes extending AbstractIndexerBolt + # indexer.md.filter: "someKey=aValue" + indexer.url.fieldname: "url" + indexer.text.fieldname: "content" + indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # if no 
alias is specified with =alias, the key value is used + # for instance below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (which is the default anyway). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse_ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. + indexer.md.mapping: + - parse.title=title + - parse.keywords=keywords + - parse.description=description + - domain + - format + + # Metrics consumers: + topology.metrics.consumer.register: + - class: "org.apache.storm.metric.LoggingMetricsConsumer" + parallelism.hint: 1 + diff --git a/external/solr/archetype/src/main/resources/archetype-resources/crawler.flux b/external/solr/archetype/src/main/resources/archetype-resources/crawler.flux new file mode 100644 index 000000000..d8e7ad0d2 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/crawler.flux @@ -0,0 +1,109 @@ +name: "crawler" + +includes: + - resource: true + file: "/crawler-default.yaml" + override: false + + - resource: false + file: "crawler-conf.yaml" + override: true + + - resource: false + file: "solr-conf.yaml" + override: true + +spouts: + - id: "spout" + className: "org.apache.stormcrawler.solr.persistence.SolrSpout" + parallelism: 1 + +bolts: + - id: "partitioner" + className: "org.apache.stormcrawler.bolt.URLPartitionerBolt" + parallelism: 1 + + - id: "fetcher" + className: "org.apache.stormcrawler.bolt.FetcherBolt" + parallelism: 1 + + - id: "sitemap" + className: "org.apache.stormcrawler.bolt.SiteMapParserBolt" + parallelism: 1 + + - id: "parse" + className: "org.apache.stormcrawler.bolt.JSoupParserBolt" + parallelism: 1 + + - id: "index" + 
className: "org.apache.stormcrawler.solr.bolt.IndexerBolt" + parallelism: 1 + + - id: "status" + className: "org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt" + parallelism: 1 + + - id: "deleter" + className: "org.apache.stormcrawler.solr.bolt.DeletionBolt" + parallelism: 1 + +streams: + - from: "spout" + to: "partitioner" + grouping: + type: SHUFFLE + + - from: "partitioner" + to: "fetcher" + grouping: + type: FIELDS + args: ["key"] + + - from: "fetcher" + to: "sitemap" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "sitemap" + to: "parse" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "parse" + to: "index" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "fetcher" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "sitemap" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "parse" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "index" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "status" + to: "deleter" + grouping: + type: LOCAL_OR_SHUFFLE + streamId: "deletion" diff --git a/external/solr/archetype/src/main/resources/archetype-resources/injection.flux b/external/solr/archetype/src/main/resources/archetype-resources/injection.flux new file mode 100644 index 000000000..3abcdee09 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/injection.flux @@ -0,0 +1,50 @@ +name: "injection" + +includes: + - resource: true + file: "/crawler-default.yaml" + override: false + + - resource: false + file: "crawler-conf.yaml" + override: true + + - resource: false + file: "solr-conf.yaml" + override: true + +spouts: + - id: "filespout" + className: "org.apache.stormcrawler.spout.FileSpout" + parallelism: 1 + constructorArgs: + - "." 
+ - "seeds.txt" + - true + +bolts: + - id: "filter" + className: "org.apache.stormcrawler.bolt.URLFilterBolt" + parallelism: 1 + + - id: "status" + className: "org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt" + parallelism: 1 + +streams: + - from: "filespout" + to: "filter" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "filter" + to: "status" + grouping: + streamId: "status" + type: CUSTOM + customClass: + className: "org.apache.stormcrawler.util.URLStreamGrouping" + constructorArgs: + - "byDomain" diff --git a/external/solr/archetype/src/main/resources/archetype-resources/pom.xml b/external/solr/archetype/src/main/resources/archetype-resources/pom.xml new file mode 100644 index 000000000..628e06dff --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/pom.xml @@ -0,0 +1,154 @@ + + + + + + + 4.0.0 + ${groupId} + ${artifactId} + ${version} + jar + + ${artifactId} + + + UTF-8 + ${StormCrawlerVersion} + 2.7.0 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 11 + 11 + + + + org.codehaus.mojo + exec-maven-plugin + 3.1.0 + + + + exec + + + + + java + true + false + compile + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + false + + + + org.apache.storm.flux.Flux + + + + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + org.apache.storm:flux-core + + org/apache/commons/** + org/apache/http/** + org/yaml/** + + + + + + + + + + + + + org.apache.stormcrawler + stormcrawler-core + ${stormcrawler.version} + + + org.apache.storm + storm-client + ${storm.version} + provided + + + org.apache.storm + flux-core + ${storm.version} + + + org.apache.stormcrawler + stormcrawler-tika + ${stormcrawler.version} + + + org.apache.stormcrawler + stormcrawler-urlfrontier + ${stormcrawler.version} + + + org.apache.stormcrawler + stormcrawler-solr + ${stormcrawler.version} + + + diff --git 
a/external/solr/archetype/src/main/resources/archetype-resources/seeds.txt b/external/solr/archetype/src/main/resources/archetype-resources/seeds.txt new file mode 100644 index 000000000..12343e9f0 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/seeds.txt @@ -0,0 +1 @@ +https://stormcrawler.apache.org/ diff --git a/external/solr/archetype/src/main/resources/archetype-resources/setup-solr.sh b/external/solr/archetype/src/main/resources/archetype-resources/setup-solr.sh new file mode 100755 index 000000000..56b1bffb6 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/setup-solr.sh @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +STATUS_SHARDS=$(grep -E '^[^#]*solr.status.routing.shards' solr-conf.yaml | sed -e 's/.*: //' | tr -d ' ') +ROUTER_FIELD=$(grep -E '^[^#]*solr.status.routing.fieldname' solr-conf.yaml | sed -e 's/.*: //' | tr -d ' ') + +if [ -z "$STATUS_SHARDS" ]; then + echo -e "\e[1mProperty 'solr.status.routing.shards not set in solr-conf.yaml'. Defaulting to 1 ...\e[0m\n" + STATUS_SHARDS=1 +fi + +if [ -z "$ROUTER_FIELD" ]; then + echo -e "\e[1mProperty 'solr.status.routing.fieldname' not set in solr-conf.yaml. 
Defaulting to 'key' ...\e[0m\n" + ROUTER_FIELD="\"key\"" +fi + +SOLR_PORT=8983 +SOLR_HOME=/opt/solr-9.7.0 + +$SOLR_HOME/bin/solr start -cloud -p $SOLR_PORT + +echo -e "\n\e[1mUploading configsets ...\e[0m\n" + +$SOLR_HOME/bin/solr zk upconfig -n "docs" -d configsets/docs -z localhost:9983 +$SOLR_HOME/bin/solr zk upconfig -n "status" -d configsets/status -z localhost:9983 +$SOLR_HOME/bin/solr zk upconfig -n "metrics" -d configsets/metrics -z localhost:9983 + +echo -e "\n\n\e[1mCreating 'docs' collection ...\e[0m\n" +curl -X POST "http://localhost:$SOLR_PORT/api/collections" -H "Content-type:application/json" -d ' + { + "name": "docs", + "numShards": 1, + "replicationFactor": 1, + "config": "docs" + }' + +echo -e "\n\n\e[1mCreating 'status' collection with $STATUS_SHARDS shard(s) and routing based on '$ROUTER_FIELD' ...\e[0m\n" +curl -X POST "http://localhost:$SOLR_PORT/api/collections" -H "Content-type:application/json" -d ' + { + "name": "status", + "numShards": '$STATUS_SHARDS', + "replicationFactor": 1, + "router": { + "name": "compositeId", + "field": '$ROUTER_FIELD' + }, + "config": "status" + }' + +echo -e "\n\n\e[1mCreating 'metrics' collection ...\e[0m\n" +curl -X POST "http://localhost:$SOLR_PORT/api/collections" -H "Content-type:application/json" -d ' + { + "name": "metrics", + "numShards": 1, + "replicationFactor": 1, + "config": "metrics" + }' diff --git a/external/solr/solr-conf.yaml b/external/solr/archetype/src/main/resources/archetype-resources/solr-conf.yaml similarity index 92% rename from external/solr/solr-conf.yaml rename to external/solr/archetype/src/main/resources/archetype-resources/solr-conf.yaml index b9d310dab..3790035ce 100644 --- a/external/solr/solr-conf.yaml +++ b/external/solr/archetype/src/main/resources/archetype-resources/solr-conf.yaml @@ -14,7 +14,7 @@ # limitations under the License. 
# configuration for SOLR resources - + config: solr.indexer.url: "http://localhost:8983/solr/docs" @@ -26,20 +26,22 @@ config: solr.status.max.results: 10 # the routing is done on the value of 'partition.url.mode' - # stores the value used for grouping the URLs as a separate field solr.status.routing.fieldname: "key" + # number of shards for the 'status' collection + solr.status.routing.shards: 1 + # time in secs for which the URLs will be considered for fetching after a ack or fail spout.ttl.purgatory: 30 - + # Min time (in msecs) to allow between 2 successive queries to SOLR spout.min.delay.queries: 2000 - + # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time # Setting this to -1 or a large value means that the ES will cache the results but also that less and less results # might be returned. spout.reset.fetchdate.after: 120 - + # Solr MetricsConsumer solr.metrics.url: "http://localhost:8983/solr/metrics" # solr.metrics.ttl.field: '__ttl__' @@ -53,5 +55,5 @@ config: # the same applies for the spout/persistence bolt and the metricsconsumer topology.metrics.consumer.register: - - class: "org.apache.stormcrawler.solr.metrics.MetricsConsumer" - parallelism.hint: 1 + - class: "org.apache.stormcrawler.solr.metrics.MetricsConsumer" + parallelism.hint: 1 diff --git a/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt new file mode 100644 index 000000000..389ef587b --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt @@ -0,0 +1,32 @@ +# skip file: ftp: and mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't parse or are not likely to be relevant +# if you want to crawl images or videos or archives then you should comment out this line 
+-(?i)\.(apk|deb|cab|iso|gif|jpg|png|svg|ico|css|sit|eps|wmf|rar|tar|jar|zip|gz|bz2|rpm|tgz|mov|exe|jpeg|jpe|bmp|js|mpg|mp3|mp4|m4a|ogv|kml|wmv|swf|flv|mkv|m4v|webm|ra|wma|wav|avi|xspf|m3u)(\?|&|$) + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +# very time-consuming : use BasicURLFilter instead +# -.*(/[^/]+)/[^/]+\1/[^/]+\1/ + +# exclude localhost and equivalents to avoid that information +# can be leaked by placing faked links pointing to web interfaces +# of services running on the crawling machine (e.g., Elasticsearch, +# Storm) +# +# - exclude localhost and loop-back addresses +# http://localhost:8080 +# http://127.0.0.1/ .. http://127.255.255.255/ +# http://[::1]/ +-^https?://(?:localhost|127(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3}|\[::1\])(?::\d+)?(?:/|$) +# +# - exclude private IP address spaces +# 10.0.0.0/8 +-^https?://(?:10(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3})(?::\d+)?(?:/|$) +# 192.168.0.0/16 +-^https?://(?:192\.168(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) +# 172.16.0.0/12 +-^https?://(?:172\.(?:1[6789]|2[0-9]|3[01])(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) + +# accept anything else ++. 
diff --git a/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml new file mode 100644 index 000000000..101bfd6b5 --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json new file mode 100644 index 000000000..4d87d8d5a --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json @@ -0,0 +1,27 @@ +{ + "org.apache.stormcrawler.parse.JSoupFilters": [ + { + "class": "org.apache.stormcrawler.jsoup.XPathFilter", + "name": "XPathFilter", + "params": { + "canonical": "//*[@rel=\"canonical\"]/@href", + "parse.description": [ + "//*[@name=\"description\"]/@content", + "//*[@name=\"Description\"]/@content" + ], + "parse.title": [ + "//TITLE/allText()", + "//META[@name=\"title\"]/@content" + ], + "parse.keywords": "//META[@name=\"keywords\"]/@content" + } + }, + { + "class": "org.apache.stormcrawler.jsoup.LinkParseFilter", + "name": "LinkParseFilter", + "params": { + "pattern": "//FRAME/@src" + } + } + ] +} diff --git a/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json new file mode 100644 index 000000000..5d525830d --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json @@ -0,0 +1,23 @@ +{ + 
"org.apache.stormcrawler.parse.ParseFilters": [ + { + "class": "org.apache.stormcrawler.parse.filter.DomainParseFilter", + "name": "DomainParseFilter", + "params": { + "key": "domain", + "byHost": false + } + }, + { + "class": "org.apache.stormcrawler.parse.filter.MimeTypeNormalization", + "name": "MimeTypeNormalization" + }, + { + "class": "org.apache.stormcrawler.parse.filter.CommaSeparatedToMultivaluedMetadata", + "name": "CommaSeparatedToMultivaluedMetadata", + "params": { + "keys": ["parse.keywords"] + } + } + ] +} diff --git a/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json new file mode 100644 index 000000000..6098631bb --- /dev/null +++ b/external/solr/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json @@ -0,0 +1,60 @@ +{ + "org.apache.stormcrawler.filtering.URLFilters": [ + { + "class": "org.apache.stormcrawler.filtering.basic.BasicURLFilter", + "name": "BasicURLFilter", + "params": { + "maxPathRepetition": 3, + "maxLength": 1024 + } + }, + { + "class": "org.apache.stormcrawler.filtering.depth.MaxDepthFilter", + "name": "MaxDepthFilter", + "params": { + "maxDepth": -1 + } + }, + { + "class": "org.apache.stormcrawler.filtering.basic.BasicURLNormalizer", + "name": "BasicURLNormalizer", + "params": { + "removeAnchorPart": true, + "unmangleQueryString": true, + "checkValidURI": true, + "removeHashes": true, + "hostIDNtoASCII": true + } + }, + { + "class": "org.apache.stormcrawler.filtering.host.HostURLFilter", + "name": "HostURLFilter", + "params": { + "ignoreOutsideHost": false, + "ignoreOutsideDomain": true + } + }, + { + "class": "org.apache.stormcrawler.filtering.regex.RegexURLNormalizer", + "name": "RegexURLNormalizer", + "params": { + "regexNormalizerFile": "default-regex-normalizers.xml" + } + }, + { + "class": "org.apache.stormcrawler.filtering.regex.RegexURLFilter", + 
"name": "RegexURLFilter", + "params": { + "regexFilterFile": "default-regex-filters.txt" + } + }, + { + "class": "org.apache.stormcrawler.filtering.basic.SelfURLFilter", + "name": "SelfURLFilter" + }, + { + "class": "org.apache.stormcrawler.filtering.sitemap.SitemapFilter", + "name": "SitemapFilter" + } + ] +} diff --git a/external/solr/cores/docs/conf/schema.xml b/external/solr/configsets/docs/conf/schema.xml similarity index 93% rename from external/solr/cores/docs/conf/schema.xml rename to external/solr/configsets/docs/conf/schema.xml index dbc2bc9a2..5606a1d13 100755 --- a/external/solr/cores/docs/conf/schema.xml +++ b/external/solr/configsets/docs/conf/schema.xml @@ -17,7 +17,7 @@ KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + - 7.5.0 + 9.0.0 diff --git a/external/solr/cores/docs/conf/stopwords.txt b/external/solr/configsets/docs/conf/stopwords.txt similarity index 100% rename from external/solr/cores/docs/conf/stopwords.txt rename to external/solr/configsets/docs/conf/stopwords.txt diff --git a/external/solr/cores/docs/conf/synonyms.txt b/external/solr/configsets/docs/conf/synonyms.txt similarity index 100% rename from external/solr/cores/docs/conf/synonyms.txt rename to external/solr/configsets/docs/conf/synonyms.txt diff --git a/external/solr/cores/metrics/conf/schema.xml b/external/solr/configsets/metrics/conf/schema.xml similarity index 91% rename from external/solr/cores/metrics/conf/schema.xml rename to external/solr/configsets/metrics/conf/schema.xml index b9d95a46d..2d727e96a 100755 --- a/external/solr/cores/metrics/conf/schema.xml +++ b/external/solr/configsets/metrics/conf/schema.xml @@ -17,7 +17,7 @@ KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + @@ -29,9 +29,11 @@ under the License. 
+ + + id - diff --git a/external/solr/cores/metrics/conf/solrconfig.xml b/external/solr/configsets/metrics/conf/solrconfig.xml similarity index 97% rename from external/solr/cores/metrics/conf/solrconfig.xml rename to external/solr/configsets/metrics/conf/solrconfig.xml index f703df44e..07bff00a8 100755 --- a/external/solr/cores/metrics/conf/solrconfig.xml +++ b/external/solr/configsets/metrics/conf/solrconfig.xml @@ -18,7 +18,7 @@ specific language governing permissions and limitations under the License. --> - 7.5.0 + 9.0.0 diff --git a/external/solr/cores/status/conf/schema.xml b/external/solr/configsets/status/conf/schema.xml similarity index 89% rename from external/solr/cores/status/conf/schema.xml rename to external/solr/configsets/status/conf/schema.xml index 8ed1207b1..16b8e6f78 100755 --- a/external/solr/cores/status/conf/schema.xml +++ b/external/solr/configsets/status/conf/schema.xml @@ -17,9 +17,13 @@ KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + + + + + diff --git a/external/solr/cores/status/conf/solrconfig.xml b/external/solr/configsets/status/conf/solrconfig.xml similarity index 96% rename from external/solr/cores/status/conf/solrconfig.xml rename to external/solr/configsets/status/conf/solrconfig.xml index ec916f908..acfa317d2 100755 --- a/external/solr/cores/status/conf/solrconfig.xml +++ b/external/solr/configsets/status/conf/solrconfig.xml @@ -18,7 +18,7 @@ specific language governing permissions and limitations under the License. --> - 7.5.0 + 9.0.0 diff --git a/external/solr/cores/docs/core.properties b/external/solr/cores/docs/core.properties deleted file mode 100644 index 5ce30361a..000000000 --- a/external/solr/cores/docs/core.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#Written by CorePropertiesLocator -#Sun Jun 14 04:10:47 CDT 2015 -name=docs -config=solrconfig.xml -schema=schema.xml -dataDir=data diff --git a/external/solr/cores/metrics/core.properties b/external/solr/cores/metrics/core.properties deleted file mode 100644 index dcfd29126..000000000 --- a/external/solr/cores/metrics/core.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#Written by CorePropertiesLocator -#Sun Jun 14 04:10:47 CDT 2015 -name=metrics -config=solrconfig.xml -schema=schema.xml -dataDir=data diff --git a/external/solr/cores/solr.xml b/external/solr/cores/solr.xml deleted file mode 100644 index 9ec41dcf5..000000000 --- a/external/solr/cores/solr.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - diff --git a/external/solr/src/main/java/org/apache/stormcrawler/solr/SeedInjector.java b/external/solr/src/main/java/org/apache/stormcrawler/solr/SeedInjector.java deleted file mode 100644 index cf765f1ac..000000000 --- a/external/solr/src/main/java/org/apache/stormcrawler/solr/SeedInjector.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.solr; - -import org.apache.storm.topology.TopologyBuilder; -import org.apache.storm.tuple.Fields; -import org.apache.stormcrawler.ConfigurableTopology; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt; -import org.apache.stormcrawler.spout.FileSpout; - -/** - * Topology which reads from a file containing seeds and distributes to SQS queues based on the IP / - * hostname / domain of the URLs. 
Used in local mode to bootstrap a crawl. - */ -public class SeedInjector extends ConfigurableTopology { - - public static void main(String[] args) throws Exception { - ConfigurableTopology.start(new SeedInjector(), args); - } - - @Override - public int run(String[] args) { - - if (args.length == 0) { - System.err.println("SeedInjector seed_dir file_filter"); - return -1; - } - - conf.setDebug(false); - - TopologyBuilder builder = new TopologyBuilder(); - - builder.setSpout("spout", new FileSpout(args[0], args[1], true)); - - Fields key = new Fields("url"); - - builder.setBolt("enqueue", new StatusUpdaterBolt()) - .fieldsGrouping("spout", Constants.StatusStreamName, key); - - return submit("SeedInjector", conf, builder); - } -} diff --git a/external/solr/src/main/java/org/apache/stormcrawler/solr/SolrCrawlTopology.java b/external/solr/src/main/java/org/apache/stormcrawler/solr/SolrCrawlTopology.java deleted file mode 100644 index ee97d9170..000000000 --- a/external/solr/src/main/java/org/apache/stormcrawler/solr/SolrCrawlTopology.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.solr; - -import org.apache.storm.topology.TopologyBuilder; -import org.apache.storm.tuple.Fields; -import org.apache.stormcrawler.ConfigurableTopology; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.bolt.FetcherBolt; -import org.apache.stormcrawler.bolt.JSoupParserBolt; -import org.apache.stormcrawler.bolt.SiteMapParserBolt; -import org.apache.stormcrawler.bolt.URLPartitionerBolt; -import org.apache.stormcrawler.solr.bolt.DeletionBolt; -import org.apache.stormcrawler.solr.bolt.IndexerBolt; -import org.apache.stormcrawler.solr.metrics.MetricsConsumer; -import org.apache.stormcrawler.solr.persistence.SolrSpout; -import org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt; - -/** Dummy topology to play with the spouts and bolts on Solr */ -public class SolrCrawlTopology extends ConfigurableTopology { - - public static void main(String[] args) throws Exception { - ConfigurableTopology.start(new SolrCrawlTopology(), args); - } - - @Override - protected int run(String[] args) { - TopologyBuilder builder = new TopologyBuilder(); - - builder.setSpout("spout", new SolrSpout()); - - builder.setBolt("partitioner", new URLPartitionerBolt()).shuffleGrouping("spout"); - - builder.setBolt("fetch", new FetcherBolt()) - .fieldsGrouping("partitioner", new Fields("key")); - - builder.setBolt("sitemap", new SiteMapParserBolt()).localOrShuffleGrouping("fetch"); - - builder.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("sitemap"); - - builder.setBolt("indexer", new IndexerBolt()).localOrShuffleGrouping("parse"); - - builder.setBolt("status", new StatusUpdaterBolt()) - .localOrShuffleGrouping("fetch", Constants.StatusStreamName) - .localOrShuffleGrouping("sitemap", Constants.StatusStreamName) - .localOrShuffleGrouping("parse", Constants.StatusStreamName) - .localOrShuffleGrouping("indexer", Constants.StatusStreamName); - - builder.setBolt("deleter", new DeletionBolt()) - 
.localOrShuffleGrouping("status", Constants.DELETION_STREAM_NAME); - - conf.registerMetricsConsumer(MetricsConsumer.class); - - return submit("crawl", conf, builder); - } -} diff --git a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/SolrSpout.java b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/SolrSpout.java index e45e1c80d..5e62560fc 100644 --- a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/SolrSpout.java +++ b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/SolrSpout.java @@ -19,6 +19,7 @@ import java.time.Instant; import java.util.Collection; import java.util.Iterator; +import java.util.Locale; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrQuery; @@ -32,11 +33,17 @@ import org.apache.storm.task.TopologyContext; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.persistence.AbstractQueryingSpout; +import org.apache.stormcrawler.solr.Constants; import org.apache.stormcrawler.solr.SolrConnection; import org.apache.stormcrawler.util.ConfUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * Spout which pulls URLs from a Solr index. The number of Spout instances should be the same as the + * number of Solr shards (`solr.status.routing.shards`). Guarantees a good mix of URLs by + * aggregating them by an arbitrary field e.g. key. 
+ */ @SuppressWarnings("serial") public class SolrSpout extends AbstractQueryingSpout { @@ -49,6 +56,12 @@ public class SolrSpout extends AbstractQueryingSpout { private static final String SolrMetadataPrefix = "solr.status.metadata.prefix"; private static final String SolrMaxResultsParam = "solr.status.max.results"; + private static final String SolrShardsParamName = Constants.PARAMPREFIX + "%s.routing.shards"; + + private int solrShards; + + private int shardID = 1; + private SolrConnection connection; private int maxNumResults = 10; @@ -71,13 +84,19 @@ public void open( super.open(stormConf, context, collector); - // This implementation works only where there is a single instance - // of the spout. Having more than one instance means that they would run - // the same queries and send the same tuples down the topology. + solrShards = + ConfUtils.getInt( + stormConf, + String.format(Locale.ROOT, SolrSpout.SolrShardsParamName, BOLT_TYPE), + 1); int totalTasks = context.getComponentTasks(context.getThisComponentId()).size(); - if (totalTasks > 1) { - throw new RuntimeException("Can't have more than one instance of SOLRSpout"); + if (totalTasks != solrShards) { + throw new RuntimeException( + "Number of SolrSpout instances should be the same as 'status' collection shards"); + } else { + // Solr uses 1-based indexing in shard names (shard1, shard2, ...) 
+ shardID = context.getThisTaskIndex() + 1; } diversityField = ConfUtils.getString(stormConf, SolrDiversityFieldParam); @@ -137,6 +156,11 @@ else if (resetFetchDateAfterNSecs != -1) { .addFilterQuery("nextFetchDate:[* TO " + lastNextFetchDate + "]") .setSort("nextFetchDate", ORDER.asc); + // add the shard parameter only when having multiple shards + if (solrShards > 1) { + query.setParam("shards", "shard" + shardID); + } + if (StringUtils.isNotBlank(diversityField) && diversityBucketSize > 0) { String[] diversityFields = diversityField.split(","); query.setStart(0) @@ -156,10 +180,15 @@ else if (resetFetchDateAfterNSecs != -1) { LOG.debug("QUERY => {}", query); try { + LOG.trace("isInQuery set to true"); + isInQuery.set(true); + long startQuery = System.currentTimeMillis(); QueryResponse response = connection.getClient().query(query); long endQuery = System.currentTimeMillis(); + markQueryReceivedNow(); + queryTimes.addMeasurement(endQuery - startQuery); SolrDocumentList docs = new SolrDocumentList(); diff --git a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/IndexerBoltTest.java b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/IndexerBoltTest.java index 3b3031c2e..f14a5cad1 100644 --- a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/IndexerBoltTest.java +++ b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/IndexerBoltTest.java @@ -52,9 +52,9 @@ public class IndexerBoltTest extends SolrContainerTest { private static final Logger LOG = LoggerFactory.getLogger(IndexerBoltTest.class); @Before - public void setupIndexerBolt() throws IOException, InterruptedException { + public void setup() throws IOException, InterruptedException { container.start(); - createCore("docs"); + createCollection("docs", 1); bolt = new IndexerBolt(); output = new TestOutputCollector(); diff --git a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SolrContainerTest.java 
b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SolrContainerTest.java index fe426c8c6..19da88139 100644 --- a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SolrContainerTest.java +++ b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SolrContainerTest.java @@ -36,14 +36,16 @@ public abstract class SolrContainerTest { protected static ExecutorService executorService; private final DockerImageName image = DockerImageName.parse("solr:9.7.0"); - private static final String coresPath = new File("cores").getAbsolutePath(); + private static final String configsetsPath = new File("configsets").getAbsolutePath(); @Rule public GenericContainer container = new GenericContainer<>(image) .withExposedPorts(8983) .withCopyFileToContainer( - MountableFile.forHostPath(coresPath), "/opt/solr/server/solr/cores") + MountableFile.forHostPath(configsetsPath), + "/opt/solr/server/solr/configsets") + .withCommand("solr-foreground -cloud") .waitingFor(Wait.forHttp("/solr/admin/cores?action=STATUS").forStatusCode(200)); @BeforeClass @@ -61,14 +63,32 @@ protected String getSolrBaseUrl() { return "http://" + container.getHost() + ":" + container.getMappedPort(8983) + "/solr"; } - protected Container.ExecResult createCore(String coreName) + protected Container.ExecResult createCollection(String collectionName, int shards) throws IOException, InterruptedException { + + // Upload configuration to Zookeeper + container.execInContainer( + "/opt/solr/bin/solr", + "zk", + "upconfig", + "-n", + collectionName, + "-d", + "/opt/solr/server/solr/configsets/" + collectionName, + "-z", + "localhost:9983"); + + // Create the collection return container.execInContainer( "/opt/solr/bin/solr", "create", "-c", - coreName, - "-d", - "/opt/solr/server/solr/cores/" + coreName); + collectionName, + "-n", + collectionName, + "-s", + String.valueOf(shards), + "-rf", + "1"); } } diff --git 
a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SpoutTest.java b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SpoutTest.java new file mode 100644 index 000000000..ce0900122 --- /dev/null +++ b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/SpoutTest.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stormcrawler.solr.persistence; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.*; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.storm.spout.SpoutOutputCollector; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.TestOutputCollector; +import org.apache.stormcrawler.TestUtil; +import org.apache.stormcrawler.persistence.Status; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Verifies that when having 2 Solr shards, documents will be processed by the 2 respective Spouts + */ +public class SpoutTest extends SolrContainerTest { + + private StatusUpdaterBolt bolt; + private SolrSpout spoutOne; + private SolrSpout spoutTwo; + private TestOutputCollector boltOutput; + private TestOutputCollector spoutOneOutput; + private TestOutputCollector spoutTwoOutput; + + private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class); + + @Before + public void setup() throws IOException, InterruptedException { + container.start(); + createCollection("status", 2); + + bolt = new StatusUpdaterBolt(); + boltOutput = new TestOutputCollector(); + + Map conf = new HashMap<>(); + conf.put("scheduler.class", "org.apache.stormcrawler.persistence.DefaultScheduler"); + conf.put("status.updater.cache.spec", "maximumSize=10000,expireAfterAccess=1h"); + conf.put("solr.status.url", getSolrBaseUrl() + "/status"); + + conf.put("solr.status.routing.shards", 2); + conf.put( + "urlbuffer.class", 
"org.apache.stormcrawler.persistence.urlbuffer.SimpleURLBuffer"); + + bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(boltOutput)); + + spoutOne = new SolrSpout(); + spoutTwo = new SolrSpout(); + spoutOneOutput = new TestOutputCollector(); + spoutTwoOutput = new TestOutputCollector(); + + spoutOne.open(conf, getContextForTask(0), new SpoutOutputCollector(spoutOneOutput)); + spoutTwo.open(conf, getContextForTask(1), new SpoutOutputCollector(spoutTwoOutput)); + } + + @After + public void close() { + LOG.info("Closing updater bolt and SOLR container"); + bolt.cleanup(); + spoutOne.close(); + spoutTwo.close(); + container.close(); + boltOutput = null; + spoutOneOutput = null; + spoutTwoOutput = null; + } + + private TopologyContext getContextForTask(int taskId) { + Map taskToComponent = new HashMap<>(); + Map> componentToTasks = new HashMap<>(); + + taskToComponent.put(0, "solrSpout"); + taskToComponent.put(1, "solrSpout"); + componentToTasks.put("solrSpout", Arrays.asList(0, 1)); + + // Mock the task related components of the context + return new TopologyContext( + null, + Map.of("storm.cluster.mode", "local"), + taskToComponent, + componentToTasks, + new HashMap<>(), + null, + null, + null, + null, + taskId, + null, + null, + null, + null, + null, + new HashMap<>(), + new AtomicBoolean(false), + null); + } + + private Future store(String url, Status status, Metadata metadata) { + Tuple tuple = mock(Tuple.class); + when(tuple.getValueByField("status")).thenReturn(status); + when(tuple.getStringByField("url")).thenReturn(url); + when(tuple.getValueByField("metadata")).thenReturn(metadata); + bolt.execute(tuple); + + return executorService.submit( + () -> { + var outputSize = boltOutput.getAckedTuples().size(); + while (outputSize == 0) { + Thread.sleep(100); + outputSize = boltOutput.getAckedTuples().size(); + } + return outputSize; + }); + } + + /** + * When using two shards,
+ * the status documents should be distributed among the two spouts + */ + @Test(timeout = 120000) + public void twoShardsTest() throws ExecutionException, InterruptedException, TimeoutException { + int expected = 100; + + for (int i = 0; i < expected; i++) { + String url = "https://" + i + "/something"; + Metadata md = new Metadata(); + md.addValue("someKey", "someValue"); + store(url, Status.DISCOVERED, md).get(10, TimeUnit.SECONDS); + } + + spoutOne.activate(); + spoutTwo.activate(); + + while (spoutOneOutput.getEmitted().size() + spoutTwoOutput.getEmitted().size() < expected) { + spoutOne.nextTuple(); + spoutTwo.nextTuple(); + } + + assertFalse(spoutOneOutput.getEmitted().isEmpty()); + assertFalse(spoutTwoOutput.getEmitted().isEmpty()); + assertEquals( + expected, spoutOneOutput.getEmitted().size() + spoutTwoOutput.getEmitted().size()); + } +} diff --git a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/StatusBoltTest.java b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/StatusBoltTest.java index 9a85cc9fd..931483326 100644 --- a/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/StatusBoltTest.java +++ b/external/solr/src/test/java/org/apache/stormcrawler/solr/persistence/StatusBoltTest.java @@ -46,9 +46,9 @@ public class StatusBoltTest extends SolrContainerTest { private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class); @Before - public void setupStatusBolt() throws IOException, InterruptedException { + public void setup() throws IOException, InterruptedException { container.start(); - createCore("status"); + createCollection("status", 4); bolt = new StatusUpdaterBolt(); output = new TestOutputCollector(); diff --git a/pom.xml b/pom.xml index f82649062..4750d50fc 100644 --- a/pom.xml +++ b/pom.xml @@ -626,6 +626,7 @@ under the License. external/warc archetype external/opensearch/archetype - + external/solr/archetype +