commit1 #10

Closed · wants to merge 1 commit
com/udacity/webcrawler/ParallelWebCrawler.java
@@ -1,18 +1,23 @@
package com.udacity.webcrawler;

import com.udacity.webcrawler.json.CrawlResult;
import com.udacity.webcrawler.parser.PageParser;
import com.udacity.webcrawler.parser.PageParserFactory;

import javax.inject.Inject;
import javax.inject.Provider;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ForkJoinPool;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
@@ -21,27 +26,85 @@
*/
final class ParallelWebCrawler implements WebCrawler {
private final Clock clock;
private final PageParserFactory parserFactory;
private final Duration timeout;
private final int popularWordCount;
private final ForkJoinPool pool;
private final List<Pattern> ignoredUrls;
private final int maxDepth;

@Inject
ParallelWebCrawler(
Clock clock,
PageParserFactory parserFactory,
@Timeout Duration timeout,
@PopularWordCount int popularWordCount,
@TargetParallelism int threadCount,
@MaxDepth int maxDepth,
@IgnoredUrls List<Pattern> ignoredUrls) {
this.clock = clock;
this.parserFactory = parserFactory;
this.timeout = timeout;
this.popularWordCount = popularWordCount;
this.pool = new ForkJoinPool(Math.min(threadCount, getMaxParallelism()));
this.maxDepth = maxDepth;
this.ignoredUrls = ignoredUrls;
}

@Override
public CrawlResult crawl(List<String> startingUrls) {
Instant deadline = clock.instant().plus(timeout);
Map<String, Integer> counts = new HashMap<>();
Set<String> visitedUrls = new HashSet<>();

for (String url : startingUrls) {
crawlInternal(url, deadline, maxDepth, counts, visitedUrls);
}
if (counts.isEmpty()) {
return new CrawlResult.Builder()
.setWordCounts(counts)
.setUrlsVisited(visitedUrls.size())
.build();
}
return new CrawlResult.Builder()
.setWordCounts(WordCounts.sort(counts, popularWordCount))
.setUrlsVisited(visitedUrls.size())
.build();
}

// Recursively crawls url up to maxDepth, accumulating word counts and visited URLs.
private void crawlInternal(
String url,
Instant deadline,
int maxDepth,
Map<String, Integer> counts,
Set<String> visitedUrls) {

if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
return;
}
for (Pattern pattern : ignoredUrls) {
if (pattern.matcher(url).matches()) {
return;
}
}
if (visitedUrls.contains(url)) {
return;
}
visitedUrls.add(url);

PageParser.Result result = parserFactory.get(url).parse();
for (Map.Entry<String, Integer> e : result.getWordCounts().entrySet()) {
if (counts.containsKey(e.getKey())) {
counts.put(e.getKey(), e.getValue() + counts.get(e.getKey()));
} else {
counts.put(e.getKey(), e.getValue());
}
}
for (String link : result.getLinks()) {
crawlInternal(link, deadline, maxDepth - 1, counts, visitedUrls);
}
}
@Override
public int getMaxParallelism() {
return Runtime.getRuntime().availableProcessors();
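Review note: the constructor builds a ForkJoinPool, but crawl() and crawlInternal() still run the whole traversal on the calling thread, and HashMap/HashSet are not safe for concurrent updates. The imported-but-unused ConcurrentSkipListSet hints at the intended design. Below is a minimal sketch of one way to move the recursion onto the pool; CrawlTask and the concurrent collections are illustrative assumptions, not part of this submission.

  // Sketch only: one RecursiveAction per URL, sharing thread-safe collections.
  // (Assumed imports: java.util.concurrent.RecursiveAction, ConcurrentMap,
  // ConcurrentSkipListSet; java.util.stream.Collectors.)
  private final class CrawlTask extends RecursiveAction {
    private final String url;
    private final Instant deadline;
    private final int maxDepth;
    private final ConcurrentMap<String, Integer> counts;
    private final Set<String> visitedUrls;  // e.g. a ConcurrentSkipListSet<String>

    CrawlTask(String url, Instant deadline, int maxDepth,
              ConcurrentMap<String, Integer> counts, Set<String> visitedUrls) {
      this.url = url;
      this.deadline = deadline;
      this.maxDepth = maxDepth;
      this.counts = counts;
      this.visitedUrls = visitedUrls;
    }

    @Override
    protected void compute() {
      if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
        return;
      }
      for (Pattern pattern : ignoredUrls) {
        if (pattern.matcher(url).matches()) {
          return;
        }
      }
      // add() is atomic: only the first task to reach a URL parses it.
      if (!visitedUrls.add(url)) {
        return;
      }
      PageParser.Result result = parserFactory.get(url).parse();
      result.getWordCounts().forEach((word, count) ->
          counts.merge(word, count, Integer::sum));
      // Fork one subtask per link and wait for all of them to finish.
      invokeAll(result.getLinks().stream()
          .map(link -> new CrawlTask(link, deadline, maxDepth - 1, counts, visitedUrls))
          .collect(Collectors.toList()));
    }
  }

crawl() would then seed the pool with pool.invoke(new CrawlTask(url, deadline, maxDepth, counts, visitedUrls)) instead of calling crawlInternal directly.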
com/udacity/webcrawler/SequentialWebCrawler.java
@@ -26,7 +26,7 @@ final class SequentialWebCrawler implements WebCrawler {
private final int popularWordCount;
private final int maxDepth;
private final List<Pattern> ignoredUrls;

@Inject
SequentialWebCrawler(
Clock clock,
@@ -58,7 +58,6 @@ public CrawlResult crawl(List<String> startingUrls) {
.setUrlsVisited(visitedUrls.size())
.build();
}

return new CrawlResult.Builder()
.setWordCounts(WordCounts.sort(counts, popularWordCount))
.setUrlsVisited(visitedUrls.size())
@@ -71,6 +70,7 @@ private void crawlInternal(
int maxDepth,
Map<String, Integer> counts,
Set<String> visitedUrls) {

if (maxDepth == 0 || clock.instant().isAfter(deadline)) {
return;
}
@@ -83,6 +83,7 @@ private void crawlInternal(
return;
}
visitedUrls.add(url);

PageParser.Result result = parserFactory.get(url).parse();
for (Map.Entry<String, Integer> e : result.getWordCounts().entrySet()) {
if (counts.containsKey(e.getKey())) {
com/udacity/webcrawler/WordCounts.java
@@ -1,9 +1,16 @@
package com.udacity.webcrawler;

import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.PriorityQueue;

/**
* Utility class that sorts the map of word counts.
@@ -28,18 +35,48 @@ final class WordCounts {
static Map<String, Integer> sort(Map<String, Integer> wordCounts, int popularWordCount) {

// TODO: Reimplement this method using only the Stream API and lambdas and/or method references.

// Order all entries with WordCountComparator; the head of the queue is the most popular word.
PriorityQueue<Map.Entry<String, Integer>> sortedCounts =
new PriorityQueue<>(wordCounts.size(), new WordCountComparator());
sortedCounts.addAll(wordCounts.entrySet());

Map<String, Integer> topCounts = new LinkedHashMap<>();
for (int i = 0; i < Math.min(popularWordCount, wordCounts.size()); i++) {
// poll() removes and returns the current head of the queue.
Map.Entry<String, Integer> entry = sortedCounts.poll();
topCounts.put(entry.getKey(), entry.getValue());
}
return topCounts;
}
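Review note: the TODO above asks for a Stream-only rewrite. A minimal sketch that reuses the existing WordCountComparator might look like this (needs java.util.stream.Collectors; a sketch, not the graded solution):

  static Map<String, Integer> sort(Map<String, Integer> wordCounts, int popularWordCount) {
    return wordCounts.entrySet().stream()
        .sorted(new WordCountComparator())
        .limit(Math.min(popularWordCount, wordCounts.size()))
        .collect(Collectors.toMap(
            Map.Entry::getKey,
            Map.Entry::getValue,
            (a, b) -> a,              // keys are unique, so this merge never runs
            LinkedHashMap::new));     // LinkedHashMap preserves the sorted order
  }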

// Note: this helper is never called, and because the loop re-puts the same
// key, the returned map never holds more than one entry.
static Map<String, Integer> supportFunc(String string, Integer stringInt, int sizeWordCounts, int popularWordCount) {
Map<String, Integer> topCounts = new LinkedHashMap<>();
for (int i = 0; i < Math.min(popularWordCount, sizeWordCounts); i++) {
topCounts.put(string, stringInt);
}
return topCounts;
}

/**
* A {@link Comparator} that sorts word count pairs correctly:
*
@@ -59,6 +96,12 @@ public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
if (a.getKey().length() != b.getKey().length()) {
return b.getKey().length() - a.getKey().length();
}
// compareTo contract: negative if a < b, 0 if a == b, positive if a > b.
return a.getKey().compareTo(b.getKey());
}
}
com/udacity/webcrawler/json/ConfigurationLoader.java
@@ -1,9 +1,14 @@
package com.udacity.webcrawler.json;

import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* A static utility class that loads a JSON configuration file.
*/
@@ -25,8 +30,14 @@ public ConfigurationLoader(Path path) {
*/
public CrawlerConfiguration load() {
try (Reader reader = Files.newBufferedReader(path)) {
return read(reader);
} catch (IOException e) {
// Returning null would only defer the failure to callers; fail loudly instead.
throw new UncheckedIOException(e);
}
}

/**
@@ -39,7 +50,16 @@ public static CrawlerConfiguration read(Reader reader) {
Objects.requireNonNull(reader);
ObjectMapper objectMapper = new ObjectMapper();
// Leave the caller's Reader open; closing it is the caller's responsibility.
objectMapper.disable(JsonParser.Feature.AUTO_CLOSE_SOURCE);
try {
return objectMapper.readValue(reader, CrawlerConfiguration.Builder.class).build();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
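For reference, a typical call site might look like this (the file name here is hypothetical):

  CrawlerConfiguration config =
      new ConfigurationLoader(Path.of("sample_config.json")).load();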
com/udacity/webcrawler/json/CrawlResultWriter.java
@@ -1,9 +1,18 @@
package com.udacity.webcrawler.json;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
* Utility class to write a {@link CrawlResult} to file.
*/
@@ -29,6 +38,14 @@ public void write(Path path) {
Objects.requireNonNull(path);
// try-with-resources ensures the writer is flushed and closed.
try (Writer writer = Files.newBufferedWriter(path)) {
write(writer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}

/**
@@ -40,5 +57,20 @@ public void write(Writer writer) {
Objects.requireNonNull(writer);
ObjectMapper objectMapper = new ObjectMapper();
// Disable AUTO_CLOSE_TARGET *before* writing so Jackson leaves the caller's Writer open.
objectMapper.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
try {
// Fix: serialize the wrapped CrawlResult rather than a Class literal.
// (Assumes the field holding it is named result.)
objectMapper.writeValue(writer, result);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
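A quick sanity check of the serialization, assuming the constructor takes the CrawlResult to write (as the class javadoc suggests):

  StringWriter buffer = new StringWriter();   // java.io.StringWriter
  new CrawlResultWriter(result).write(buffer);
  System.out.println(buffer);                 // the CrawlResult as JSON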