Skip to content

Commit

Permalink
Upgrade Tika to 3.0.0 and lighten TIKA dependency tree
Browse files Browse the repository at this point in the history
  • Loading branch information
rzo1 committed Oct 22, 2024
1 parent fcfcda9 commit 38ac2b1
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 44 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,18 @@ An example on how to configure form authentication in its most simple form:

Overriding `FormAuthInfo.doFormLogin(...)` allows implementing more dynamic form authentication (entering more fields, fetching dynamic login forms, ...).

## Binary Content

To lighten the dependency tree, crawler4j now depends only on the Apache Tika HTML parser module instead of the full Tika standard parser package.
If you need more parsing capabilities (i.e. for other binary formats), add the related dependency to your own project:
```xml
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>VERSION</version>
</dependency>
```

## High-level design diagrams

The activity diagram below highlights the most important steps and vocabulary to know as a user of the library.
Expand Down
28 changes: 14 additions & 14 deletions crawler4j-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<artifactId>tika-parser-html-module</artifactId>
<version>${apache.tika.version}</version>
<exclusions>
<exclusion>
Expand All @@ -186,19 +186,19 @@
</exclusion>
</exclusions>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>${assertj.version}</version>
<scope>test</scope>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>${assertj.version}</version>
<scope>test</scope>
</dependency>
<!-- Test Dependencies -->
<dependency>
<groupId>de.hs-heilbronn.mi</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.html.JSoupParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -51,7 +52,7 @@ public class TikaHtmlParser implements edu.uci.ics.crawler4j.parser.HtmlParser {
private final CrawlConfig config;
private final TLDList tldList;

private final HtmlParser htmlParser;
private final Parser htmlParser;
private final ParseContext parseContext;
private final WebURLFactory factory;
private final BasicURLNormalizer normalizer;
Expand All @@ -61,7 +62,7 @@ public TikaHtmlParser(CrawlConfig config, BasicURLNormalizer normalizer, TLDList
this.tldList = tldList;
this.normalizer = normalizer;

htmlParser = new HtmlParser();
htmlParser = new JSoupParser();
parseContext = new ParseContext();
parseContext.set(HtmlMapper.class, new AllTagMapper());
this.factory = webURLFactory;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package edu.uci.ics.crawler4j.tests.fetcher.politeness;

import org.assertj.core.api.Assertions;
import org.awaitility.Awaitility;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

Expand All @@ -29,6 +30,8 @@
import edu.uci.ics.crawler4j.test.SimpleWebURL;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.concurrent.TimeUnit;

public class SimplePolitenessServerTestCase {

private edu.uci.ics.crawler4j.PolitenessServer simplePolitenessServer;
Expand All @@ -54,8 +57,8 @@ public void testApplyPoliteness1() {
politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);

Assertions.assertThat(politenessDelay).isBetween(//
Long.valueOf(config.getPolitenessDelay() - 10)//
, Long.valueOf(config.getPolitenessDelay()));
(long) (config.getPolitenessDelay() - 10)//
, (long) config.getPolitenessDelay());

}

Expand All @@ -74,15 +77,13 @@ public void testApplyPoliteness2() {
politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);

Assertions.assertThat(politenessDelay).isBetween(//
Long.valueOf(config.getPolitenessDelay() - 10)//
, Long.valueOf(config.getPolitenessDelay()));

//let's wait some time, it should not be listed anymore
sleep(1000);

politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);
(long) (config.getPolitenessDelay() - 10)//
, (long) config.getPolitenessDelay());

Assertions.assertThat(politenessDelay).isEqualTo(CachedPolitenessServer.NO_POLITENESS_APPLIED);
Awaitility.await().atMost(10, TimeUnit.SECONDS).untilAsserted(() -> {
final long delay = simplePolitenessServer.applyPoliteness(webUrl);
Assertions.assertThat(delay).isEqualTo(CachedPolitenessServer.NO_POLITENESS_APPLIED);
});

}

Expand All @@ -101,30 +102,20 @@ public void testApplyPoliteness3() {
politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);

Assertions.assertThat(politenessDelay).isBetween(//
Long.valueOf(config.getPolitenessDelay() - 10)//
, Long.valueOf(config.getPolitenessDelay()));
(long) (config.getPolitenessDelay() - 10)//
, (long) config.getPolitenessDelay());

webUrl.setURL("https://github.com/yasserg/crawler4j/blob/master/pom.xml");

politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);

Assertions.assertThat(politenessDelay).isEqualTo(config.getPolitenessDelay());

//let's wait some time, it should not be listed anymore
sleep(3000);

politenessDelay = simplePolitenessServer.applyPoliteness(webUrl);

Assertions.assertThat(politenessDelay).isEqualTo(CachedPolitenessServer.NO_POLITENESS_APPLIED);

}
Awaitility.await().atMost(10, TimeUnit.SECONDS).untilAsserted(() -> {
final long delay = simplePolitenessServer.applyPoliteness(webUrl);
Assertions.assertThat(delay).isEqualTo(CachedPolitenessServer.NO_POLITENESS_APPLIED);
});

private void sleep(int i) {
try {
Thread.sleep(i);
} catch (InterruptedException e) {
//nothing to do here
}
}

}
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
<apache.http.core.version>5.3</apache.http.core.version>
<apache.http.core.h2.version>5.3</apache.http.core.h2.version>

<apache.tika.version>2.9.1</apache.tika.version>
<apache.tika.version>3.0.0</apache.tika.version>
<!-- XXX replace with crawler commons -->
<url-detector.version>0.1.23</url-detector.version>
<crawler-commons.version>1.3</crawler-commons.version>
Expand Down

0 comments on commit 38ac2b1

Please sign in to comment.