Calculate word count using wordcount.lua filter

pelican-plugins · Nov 23, 2023 · efea9ac · efea9ac
1 parent 89b78d5
commit efea9ac
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -232,7 +232,7 @@ The default value for reading speed is set to 200 words per minute, but may be c
 READING_SPEED = <words-per-minute>
 ```
 
-The number of words in a document is calculated using the [Markdown Word Count](https://github.com/gandreadis/markdown-word-count) package.
+The number of words in a document is calculated using the [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount).
 
 ### Customizing the Path for the `pandoc` Executable
 

diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,3 @@
+Release type: minor
+
+* Use the [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount) instead of the [markdown-word-count](https://github.com/gandreadis/markdown-word-count) Python package to calculate the word count
diff --git a/pelican/plugins/pandoc_reader/filters/wordcount.lua b/pelican/plugins/pandoc_reader/filters/wordcount.lua
@@ -0,0 +1,56 @@
+-- counts words in a document
+
+words = 0  -- running word total, incremented by the `wordcount` handlers
+characters = 0  -- running character total; the handlers leave whitespace out of it
+characters_and_spaces = 0  -- running character total that also includes whitespace
+process_anyway = false  -- set to true by Meta(); while false, Pandoc() exits after printing
+
+wordcount = {  -- per-element handlers; Pandoc() passes this table to pandoc.walk_block
+  Str = function(el)
+    -- count a word only if it has at least one non-punctuation character (%P):
+    if el.text:match("%P") then
+        words = words + 1
+    end
+    characters = characters + utf8.len(el.text)  -- utf8.len counts characters, not bytes
+    characters_and_spaces = characters_and_spaces + utf8.len(el.text)
+  end,
+
+  Space = function(el)  -- a Space adds one to the whitespace-inclusive total only
+    characters_and_spaces = characters_and_spaces + 1
+  end,
+
+  Code = function(el)  -- inline code: every run of non-whitespace counts as one word
+    _,n = el.text:gsub("%S+","")  -- n = number of matches; NOTE(review): `_`, `n` are globals (no `local`)
+    words = words + n
+    text_nospace = el.text:gsub("%s", "")  -- keeps gsub's first result only; also a global
+    characters = characters + utf8.len(text_nospace)
+    characters_and_spaces = characters_and_spaces + utf8.len(el.text)
+  end,
+
+  CodeBlock = function(el)  -- code blocks are counted exactly like inline Code above
+    _,n = el.text:gsub("%S+","")  -- n = number of non-whitespace runs in the block
+    words = words + n
+    text_nospace = el.text:gsub("%s", "")  -- block text with all whitespace removed
+    characters = characters + utf8.len(text_nospace)
+    characters_and_spaces = characters_and_spaces + utf8.len(el.text)
+  end
+}
+
+-- set `process_anyway` when the `wordcount` metadata field is `process-anyway`, `process` or `convert`
+function Meta(meta)
+  if meta.wordcount and (meta.wordcount=="process-anyway"
+    or meta.wordcount=="process" or meta.wordcount=="convert") then
+      process_anyway = true
+  end
+end
+
+function Pandoc(el)  -- walk the document body with `wordcount`, then print the three totals
+    -- skip metadata, just count body (the blocks are wrapped in one Div so a single walk covers them):
+    pandoc.walk_block(pandoc.Div(el.blocks), wordcount)
+    print(words .. " words in body")  -- the word total is the first line printed
+    print(characters .. " characters in body")
+    print(characters_and_spaces .. " characters in body (including spaces)")
+    if not process_anyway then  -- unless Meta() saw one of the opt-in values
+      os.exit(0)  -- exit with status 0 right after printing the counts
+    end
+end
diff --git a/pelican/plugins/pandoc_reader/pandoc_reader.py b/pelican/plugins/pandoc_reader/pandoc_reader.py
@@ -6,7 +6,6 @@
 import subprocess
 
 import bs4
-from mwc.counter import count_words_in_markdown
 from ruamel.yaml import YAML, constructor
 
 from pelican import signals
@@ -22,6 +21,7 @@
     "%7Bfilename%7D": "{filename}",
 }
 FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"]
+FILTERS_PATH = os.path.abspath(os.path.join(DIR_PATH, "filters"))
 PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html"
 PANDOC_SUPPORTED_MAJOR_VERSION = 2
 PANDOC_SUPPORTED_MINOR_VERSION = 11
@@ -128,7 +128,8 @@ def _create_html(self, source_path, content, pandoc_executable):
         if self.settings.get("CALCULATE_READING_TIME", []):
             # Calculate reading time and add to metadata
             metadata["reading_time"] = self.process_metadata(
-                "reading_time", self._calculate_reading_time(content)
+                "reading_time",
+                self._calculate_reading_time(pandoc_executable, source_path),
             )
 
         return output, metadata
@@ -200,10 +201,28 @@ def _check_defaults(self, defaults_files):
 
         return citations, table_of_contents
 
-    def _calculate_reading_time(self, content):
+    def _calculate_reading_time(self, pandoc_executable, source_path):
         """Calculate time taken to read content."""
         reading_speed = self.settings.get("READING_SPEED", DEFAULT_READING_SPEED)
-        wordcount = count_words_in_markdown(content)
+
+        # Use the wordcount.lua filter to get the word count for the reading time
+        output = subprocess.run(
+            [
+                pandoc_executable,
+                "--lua-filter",
+                os.path.join(FILTERS_PATH, "wordcount.lua"),
+                source_path,
+            ],
+            capture_output=True,
+            encoding="utf-8",
+            check=True,
+        )
+
+        # We have to extract the word count from stdout which looks like
+        # 102 words in body
+        # 536 characters in body
+        # 636 characters in body (including spaces)
+        wordcount = output.stdout.split()[0]
 
         time_unit = "minutes"
         try:

diff --git a/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md b/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md
@@ -6,4 +6,4 @@ date: "2020-10-16"
 
 ## What is Lorem Ipsum
 
-Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
+Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Lorep Ipsum paragragh should be 100 words.
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,6 @@ classifiers = [
 python = ">=3.8.1,<4.0"
 pelican = ">=4.5"
 markdown = {version = "<=3.3.4", optional = true}
-markdown-word-count = "^0.0.1"
 pyyaml = "^6.0.0"
 beautifulsoup4 = "^4.9.3"
 "ruamel.yaml" = "^0.17.32"