commit1 #10

Closed · wants to merge 1 commit
com/udacity/webcrawler/ParallelWebCrawler.java
@@ -1,18 +1,23 @@
package com.udacity.webcrawler;

import com.udacity.webcrawler.json.CrawlResult;
import com.udacity.webcrawler.parser.PageParser;
import com.udacity.webcrawler.parser.PageParserFactory;

import javax.inject.Inject;
import javax.inject.Provider;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ForkJoinPool;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
@@ -21,27 +26,85 @@
*/
final class ParallelWebCrawler implements WebCrawler {
private final Clock clock;
private final PageParserFactory parserFactory;
private final Duration timeout;
private final int popularWordCount;
private final ForkJoinPool pool;
private final List<Pattern> ignoredUrls;
private final int maxDepth;

@Inject
ParallelWebCrawler(
Clock clock,
PageParserFactory parserFactory,
@Timeout Duration timeout,
@PopularWordCount int popularWordCount,
@TargetParallelism int threadCount,
@MaxDepth int maxDepth,
@IgnoredUrls List<Pattern> ignoredUrls) {
this.clock = clock;
this.parserFactory = parserFactory;
this.timeout = timeout;
this.popularWordCount = popularWordCount;
this.pool = new ForkJoinPool(Math.min(threadCount, getMaxParallelism()));
this.maxDepth = maxDepth;
this.ignoredUrls = ignoredUrls;
}

@Override
public CrawlResult crawl(List<String> startingUrls) {
Instant deadline = clock.instant().plus(timeout);
Map<String, Integer> counts = new HashMap<>();
Set<String> visitedUrls = new HashSet<>();

for (String url : startingUrls) {
crawlInternal(url, deadline, maxDepth, counts, visitedUrls);
}
if (counts.isEmpty()) {
return new CrawlResult.Builder()
.setWordCounts(counts)
.setUrlsVisited(visitedUrls.size())
.build();
}
return new CrawlResult.Builder()
.setWordCounts(WordCounts.sort(counts, popularWordCount))
.setUrlsVisited(visitedUrls.size())
.build();
}

// Recursively crawls url up to maxDepth, accumulating word counts and visited URLs.
private void crawlInternal(
String url,
Instant deadline,
int maxDepth,
Map<String, Integer> counts,
Set<String> visitedUrls) {

if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
return;
}
for (Pattern pattern : ignoredUrls) {
if (pattern.matcher(url).matches()) {
return;
}
}
if (visitedUrls.contains(url)) {
return;
}
visitedUrls.add(url);

PageParser.Result result = parserFactory.get(url).parse();
for (Map.Entry<String, Integer> e : result.getWordCounts().entrySet()) {
if (counts.containsKey(e.getKey())) {
counts.put(e.getKey(), e.getValue() + counts.get(e.getKey()));
} else {
counts.put(e.getKey(), e.getValue());
}
}
for (String link : result.getLinks()) {
crawlInternal(link, deadline, maxDepth - 1, counts, visitedUrls);
}
}
@Override
public int getMaxParallelism() {
return Runtime.getRuntime().availableProcessors();
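Review note: the constructor builds a ForkJoinPool, but crawl() and crawlInternal() still run the whole traversal on the calling thread, and HashMap/HashSet are not safe for concurrent updates. The imported-but-unused ConcurrentSkipListSet hints at the intended design. Below is a minimal sketch of one way to move the recursion onto the pool; CrawlTask and the concurrent collections are illustrative assumptions, not part of this submission.

  // Sketch only: one RecursiveAction per URL, sharing thread-safe collections.
  // (Assumed imports: java.util.concurrent.RecursiveAction, ConcurrentMap,
  // ConcurrentSkipListSet; java.util.stream.Collectors.)
  private final class CrawlTask extends RecursiveAction {
    private final String url;
    private final Instant deadline;
    private final int maxDepth;
    private final ConcurrentMap<String, Integer> counts;
    private final Set<String> visitedUrls;  // e.g. a ConcurrentSkipListSet<String>

    CrawlTask(String url, Instant deadline, int maxDepth,
              ConcurrentMap<String, Integer> counts, Set<String> visitedUrls) {
      this.url = url;
      this.deadline = deadline;
      this.maxDepth = maxDepth;
      this.counts = counts;
      this.visitedUrls = visitedUrls;
    }

    @Override
    protected void compute() {
      if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
        return;
      }
      for (Pattern pattern : ignoredUrls) {
        if (pattern.matcher(url).matches()) {
          return;
        }
      }
      // add() is atomic: only the first task to reach a URL parses it.
      if (!visitedUrls.add(url)) {
        return;
      }
      PageParser.Result result = parserFactory.get(url).parse();
      result.getWordCounts().forEach((word, count) ->
          counts.merge(word, count, Integer::sum));
      // Fork one subtask per link and wait for all of them to finish.
      invokeAll(result.getLinks().stream()
          .map(link -> new CrawlTask(link, deadline, maxDepth - 1, counts, visitedUrls))
          .collect(Collectors.toList()));
    }
  }

crawl() would then seed the pool with pool.invoke(new CrawlTask(url, deadline, maxDepth, counts, visitedUrls)) instead of calling crawlInternal directly.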
com/udacity/webcrawler/SequentialWebCrawler.java
@@ -26,7 +26,7 @@ final class SequentialWebCrawler implements WebCrawler {
private final int popularWordCount;
private final int maxDepth;
private final List<Pattern> ignoredUrls;

@Inject
SequentialWebCrawler(
Clock clock,
@@ -58,7 +58,6 @@ public CrawlResult crawl(List<String> startingUrls) {
.setUrlsVisited(visitedUrls.size())
.build();
}

return new CrawlResult.Builder()
.setWordCounts(WordCounts.sort(counts, popularWordCount))
.setUrlsVisited(visitedUrls.size())
@@ -71,6 +70,7 @@ private void crawlInternal(
int maxDepth,
Map<String, Integer> counts,
Set<String> visitedUrls) {

if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
return;
}
@@ -83,6 +83,7 @@ private void crawlInternal(
return;
}
visitedUrls.add(url);

PageParser.Result result = parserFactory.get(url).parse();
for (Map.Entry<String, Integer> e : result.getWordCounts().entrySet()) {
if (counts.containsKey(e.getKey())) {
com/udacity/webcrawler/WordCounts.java
@@ -1,9 +1,16 @@
package com.udacity.webcrawler;

import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.PriorityQueue;

/**
* Utility class that sorts the map of word counts.
@@ -28,18 +35,48 @@ final class WordCounts {
static Map<String, Integer> sort(Map<String, Integer> wordCounts, int popularWordCount) {

// TODO: Reimplement this method using only the Stream API and lambdas and/or method references.

// Order all entries with WordCountComparator; the head of the queue is the most popular word.
PriorityQueue<Map.Entry<String, Integer>> sortedCounts =
new PriorityQueue<>(wordCounts.size(), new WordCountComparator());
sortedCounts.addAll(wordCounts.entrySet());

Map<String, Integer> topCounts = new LinkedHashMap<>();
for (int i = 0; i < Math.min(popularWordCount, wordCounts.size()); i++) {
// poll() removes and returns the current head of the queue.
Map.Entry<String, Integer> entry = sortedCounts.poll();
topCounts.put(entry.getKey(), entry.getValue());
}
return topCounts;
}
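Review note: the TODO above asks for a Stream-only rewrite. A minimal sketch that reuses the existing WordCountComparator might look like this (needs java.util.stream.Collectors; a sketch, not the graded solution):

  static Map<String, Integer> sort(Map<String, Integer> wordCounts, int popularWordCount) {
    return wordCounts.entrySet().stream()
        .sorted(new WordCountComparator())
        .limit(Math.min(popularWordCount, wordCounts.size()))
        .collect(Collectors.toMap(
            Map.Entry::getKey,
            Map.Entry::getValue,
            (a, b) -> a,              // keys are unique, so this merge never runs
            LinkedHashMap::new));     // LinkedHashMap preserves the sorted order
  }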

// Note: this helper is never called, and because the loop re-puts the same
// key, the returned map never holds more than one entry.
static Map<String, Integer> supportFunc(String string, Integer stringInt, int sizeWordCounts, int popularWordCount) {
Map<String, Integer> topCounts = new LinkedHashMap<>();
for (int i = 0; i < Math.min(popularWordCount, sizeWordCounts); i++) {
topCounts.put(string, stringInt);
}
return topCounts;
}

/**
* A {@link Comparator} that sorts word count pairs correctly:
*
@@ -59,6 +96,12 @@ public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
if (a.getKey().length() != b.getKey().length()) {
return b.getKey().length() - a.getKey().length();
}
// compareTo contract: negative if a < b, 0 if a == b, positive if a > b.
return a.getKey().compareTo(b.getKey());
}
}
com/udacity/webcrawler/json/ConfigurationLoader.java
@@ -1,9 +1,14 @@
package com.udacity.webcrawler.json;

import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* A static utility class that loads a JSON configuration file.
*/
@@ -25,8 +30,14 @@ public ConfigurationLoader(Path path) {
*/
public CrawlerConfiguration load() {
try (Reader reader = Files.newBufferedReader(path)) {
return read(reader);
} catch (IOException e) {
// Returning null would only defer the failure to callers; fail loudly instead.
throw new UncheckedIOException(e);
}
}

/**
@@ -39,7 +50,16 @@ public static CrawlerConfiguration read(Reader reader) {
Objects.requireNonNull(reader);
ObjectMapper objectMapper = new ObjectMapper();
// Leave the caller's Reader open; closing it is the caller's responsibility.
objectMapper.disable(JsonParser.Feature.AUTO_CLOSE_SOURCE);
try {
return objectMapper.readValue(reader, CrawlerConfiguration.Builder.class).build();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
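For reference, a typical call site might look like this (the file name here is hypothetical):

  CrawlerConfiguration config =
      new ConfigurationLoader(Path.of("sample_config.json")).load();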
com/udacity/webcrawler/json/CrawlResultWriter.java
@@ -1,9 +1,18 @@
package com.udacity.webcrawler.json;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* Utility class to write a {@link CrawlResult} to file.
*/
@@ -29,6 +38,14 @@ public void write(Path path) {
Objects.requireNonNull(path);
// try-with-resources ensures the writer is flushed and closed.
try (Writer writer = Files.newBufferedWriter(path)) {
write(writer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}

/**
@@ -40,5 +57,20 @@ public void write(Writer writer) {
Objects.requireNonNull(writer);
ObjectMapper objectMapper = new ObjectMapper();
// Disable AUTO_CLOSE_TARGET *before* writing so Jackson leaves the caller's Writer open.
objectMapper.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
try {
// Fix: serialize the wrapped CrawlResult rather than a Class literal.
// (Assumes the field holding it is named result.)
objectMapper.writeValue(writer, result);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
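A quick sanity check of the serialization, assuming the constructor takes the CrawlResult to write (as the class javadoc suggests):

  StringWriter buffer = new StringWriter();   // java.io.StringWriter
  new CrawlResultWriter(result).write(buffer);
  System.out.println(buffer);                 // the CrawlResult as JSON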