ns chapter-2-input-output.2-1-loading-data
(:nextjournal.clerk/visibility {:code :hide}
{:nextjournal.clerk/toc true}
:require [scicloj.kind-clerk.api :as kind-clerk])) (
(kind-clerk/setup!)
:ok
doall
here ;; #### With tablecloth ;; For most work involving tabular/columnar data, you’ll use tablecloth, Clojure’s go-to data ;; wrangling library. These all return a tech.ml.dataset Dataset
object. The implementation ;; details aren’t important now, but tech.ml.dataset
is the library that allows for efficient ;; and fast operations on columnar datasets. ;; TODO: Be consistent about you vs we – pick on and stick with it (require ‘[tablecloth.api :as tc]) (require’[nextjournal.clerk :as clerk]) ;; (clerk/add-viewers! [{:pred #(= tech.v3.dataset.impl.dataset.Dataset (type %)) ;; ;; :fetch-fn (fn [_ file] {:nextjournal/content-type “image/png” ;; ;; :nextjournal/value (Files/readAllBytes (.toPath file))}) ;; :render-fn v/table}]) (-> “data/co2_over_time.csv” tc/dataset) ;; Note the built-in pretty printing. ;; TODO: Write elsewhere about kindly and notebooks, how they know how to render different things ;; Easy things to tidy up at import time: ;; ##### Transforming headers ;; We’ll require Clojure’s standard string library for this example. The transformation function is ;; arbitrary though, accepting a single header value and returning a single, transformed value. (require ‘[clojure.string :as str]) (defn- lower-case-keyword [val] (-> val (str/replace #“+” “-”) str/lower-case keyword)) (-> “data/co2_over_time.csv” (tc/dataset {:key-fn lower-case-keyword})) ;; ##### Specifying separators ;; Tablecloth is pretty smart about standard formats, e.g. CSV above and TSV: (-> “data/co2_over_time.tsv” tc/dataset) ;; But it can also accept an arbitrary separator if for some reason you have some data that uses ;; a non-standard file format (have a look at data/co2_over_time.txt
). Note the separator has to ;; be a single character. (-> “data/co2_over_time.txt” (tc/dataset {:separator “/”})) ;; ##### Specify file encoding ;; TODO: does this really matter? test out different file encodings.. ;; ##### Normalize values into consistent formats and types ;; Tablecloth makes it easy to apply arbitrary transformations to all values in a given column ;; We can inspect the column metadata with tablecloth: (def dataset (tc/dataset “data/co2_over_time.csv”)) (-> dataset (tc/info :columns)) ;; Certain types are built-in (it knows what to do to convert them, e.g. numbers:) ;; TODO: Explain why numbers get rounded? Probably not here.. in addendum about numbers in Clojure (-> dataset (tc/convert-types “CO2” :double) (tc/info :columns)) ;; The full list of magic symbols representing types tablecloth supports comes from the underlying ;; tech.ml.dataset
library: (require’[tech.v3.datatype.casting :as casting]) @casting/valid-datatype-set ;; More details on supported types here. ;; TODO: Explain when to use :double vs :type/numerical? What’s the difference? ;; You can also process multiple columns at once, either by specifying a map of columns to data types: (-> dataset (tc/convert-types {“CO2” :double “adjusted CO2” :double}) (tc/info :columns)) ;; Or by changing all columns of a certain type to another: (-> dataset (tc/convert-types :type/numerical :double) (tc/info :columns)) ;; The supported column types are: ;; :type/numerical - any numerical type ;; :type/float - floating point number (:float32 and :float64) ;; :type/integer - any integer ;; :type/datetime - any datetime type ;; Also the magical :!type
qualifier exists, which will select the complement set – all columns that ;; are not the specified type ;; For others you need to provide a casting function yourself, e.g. adding the UTC start of day, ;; accounting for local daylight savings (defn to-start-of-day-UTC [local-date] (-> local-date .atStartOfDay (java.time.ZonedDateTime/ofLocal (java.time.ZoneId/systemDefault) (java.time.ZoneOffset/UTC)))) (-> dataset (tc/convert-types “Date” [[:timezone-date to-start-of-day-UTC]]) (tc/info :columns)) ;; For full details on all the possible options for type conversion of columns see the ;; tablecloth API docs ;; ### Reading from a URL ;; CSV: (-> “https://vega.github.io/vega-lite/data/co2-concentration.csv” tc/dataset) ;; JSON: works as long as the data is an array of maps (-> “https://vega.github.io/vega-lite/data/cars.json” tc/dataset) ;; Tablecloth can handle a string that points to any file that contains either raw or gzipped csv/tsv, ;; json, xls(x), on the local file system or a URL. ;; ### Reading an excel file ;; Tablecloth supports reading xls and xlsx files iff the underlying Java library for working with ;; excel is included: (require ‘[tech.v3.libs.poi]) ;; This is not included in the library by default because poi
has a hard dependency on log4j2, along ;; with many other dependencies that the core team at tech.ml.dataset
(upon which tablecloth is built) ;; did not want to impose on all users by default (https://clojurians.zulipchat.com/#narrow/stream/236259-tech.2Eml.2Edataset.2Edev/topic/working.20with.20excel.20files/near/314711378). ;; You can still require it here, you’ll most likely just see an error that says something like ;; “Log4j2 could not find a logging implementation. Please add log4j-core to the classpath.”, unless ;; you already have a valid log4j config on your class path. ;; This should work according to maintainers, does not atm (tc/dataset “data/example_XLS.xls” {:filetype “xls”}) (tc/dataset “data/example_XLSX.xlsx” {:filetype “xlsx”}) (require’[dk.ative.docjure.spreadsheet :as xl]) (def xl-workbook (xl/load-workbook “data/example_XLS.xls”)) ;; To discover sheet names: (->> xl-workbook xl/sheet-seq (map xl/sheet-name)) ;; This will show us there is only one sheet in this workbook, named “Sheet1”. You can get the data ;; out of it like this: ;; To discover header names: (def headers (->> xl-workbook (xl/select-sheet “Sheet1”) xl/row-seq first xl/cell-seq (map xl/read-cell))) ;; To get the data out of the columns: (def column-index->header (zipmap [:A :B :C :D :E :F :G :H :I] headers)) (->> xl-workbook (xl/select-sheet “Sheet1”) (xl/select-columns column-index->header)) ;; and into a tablecloth dataset like this: (->> xl-workbook (xl/select-sheet “Sheet1”) (xl/select-columns column-index->header) (drop 1) ;; don’t count the header row as a row tc/dataset) ;; You might be tempted to just iterate over each row and read each cell, but it’s more ;; convenient to think of the data as column-based rather than row-based for tablecloth’s purposes. ;; Setting the dataset headers is more verbose when we’re starting from a seq of seqs, since ;; the header-row?
option does not work for a seq of seqs (this option is implemented in the ;; low-level parsing code for each supported input type and is not currently implemented for ;; a seq of seqs). (def iterated-xl-data (->> xl-workbook (xl/select-sheet “Sheet1”) xl/row-seq (map #(->> % xl/cell-seq (map xl/read-cell))))) ;; Note the header-row?
option is not supported: (tc/dataset iterated-xl-data {:header-row? true}) ;; Can do it manually, but just working with columns from the start is more idiomatic: (let [headers (first iterated-xl-data) rows (rest iterated-xl-data)] (map #(zipmap headers %) rows)) ;; ### Reading from a database ;; #### SQL database ;; (tc/dataset (,,, results from some SQL query)) ;; requires com.github.seancorfield/next.jdbc {:mvn/version "1.3.847"}
in deps.edn
;; Note you will also require the relevant driver for the type of db you are trying ;; to access. These are some available ones: (require ‘[next.jdbc :as jdbc]) ;; Connect to the db: (def db {:dbname “data/Chinook_Sqlite.sqlite” :dbtype “sqlite”}) (def ds (jdbc/get-datasource db)) ds ;; Pass the results of a sql query to tablecloth to make a (-> ds (jdbc/execute! [“SELECT * FROM artist”]) (tc/dataset)) ;; Passing a parameter to a query (-> ds (jdbc/execute! [“SELECT * FROM artist WHERE Name = ?” “Aerosmith”]) (tc/dataset)) ;; note for SQLite specifically the concat operator is ||
not +
(-> ds (jdbc/execute! [“SELECT * FROM artist WHERE Name like ‘%’ || ? || ‘%’” “man”]) (tc/dataset)) ;; #### SPARQL database (require’[grafter-2.rdf4j.repository :as repo]) (require ‘[grafter-2.rdf.protocols :as pr]) (def sparql (repo/sparql-repo “https://query.wikidata.org/sparql”)) ;; taken from: https://query.wikidata.org/#%23Public%20sculptures%20in%20Paris%0ASELECT%20DISTINCT%20%3Fitem%20%20%3FTitre%20%3Fcreateur%20%28year%28%3Fdate%29%20as%20%3FAnneeCreation%29%20%3Fimage%20%3Fcoord%0AWHERE%0A%7B%0A%20%20%20%3Fitem%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ860861.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20sculpture%0A%20%20%20%3Fitem%20wdt%3AP136%20wd%3AQ557141%20.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20genre%C2%A0%3A%20art%20public%0A%20%20%20%7B%3Fitem%20wdt%3AP131%20wd%3AQ90.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20…%20situ%C3%A9e%20dans%20Paris%0A%20%20%20UNION%0A%20%20%20%7B%3Fitem%20wdt%3AP131%20%3Farr.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20…%20ou%20dans%20un%20arrondissement%20de%20Paris%20%0A%20%20%20%3Farr%20wdt%3AP131%20wd%3AQ90.%20%7D%0A%20%20%20%3Fitem%20rdfs%3Alabel%20%3FTitre%20FILTER%20%28lang%28%3FTitre%29%20%3D%20%22fr%22%29.%20%20%23%20Titre%0A%20%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP170%20%3FQcreateur.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20cr%C3%A9ateur%2Fcr%C3%A9atrice%20%28option%29%0A%20%20%20%3FQcreateur%20rdfs%3Alabel%20%3Fcreateur%20FILTER%20%28lang%28%3Fcreateur%29%20%3D%20%22fr%22%29%20.%7D%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP571%20%3Fdate.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20date%20de%20cr%C3%A9ation%20%28option%29%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP18%20%20%3Fimage.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20image%20%28option%2
9%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP625%20%3Fcoord.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20coordonn%C3%A9es%20g%C3%A9ographiques%20%28option%29%0A%7D (def sparql-results (let [conn (repo/->connection sparql)] (-> conn (repo/query “# Public sculptures in Paris SELECT DISTINCT ?item ?title ?creator (year(?date) as ?year) ?coord WHERE { ?item wdt:P31/wdt:P279* wd:Q860861. # sculpture ?item wdt:P136 wd:Q557141 . # genre : art public {?item wdt:P131 wd:Q90.} # … située dans Paris UNION {?item wdt:P131 ?arr. # … ou dans un arrondissement de Paris ?arr wdt:P131 wd:Q90. } ?item rdfs:label ?title FILTER (lang(?title) = "fr"). # title OPTIONAL {?item wdt:P170 ?Qcreateur. # créateur/créatrice (option) ?Qcreateur rdfs:label ?creator FILTER (lang(?creator) = "fr") .} OPTIONAL {?item wdt:P571 ?date.} # date de création (option) OPTIONAL {?item wdt:P18 ?image.} # image (option) OPTIONAL {?item wdt:P625 ?coord.} # coordonnées géographiques (option) }”)))) ;; grafter db can help format RDF values (def sparql-ds (-> sparql-results tc/dataset (tc/update-columns [:coord :title :creator] (partial map pr/raw-value)))) ;; ### Generating sequences (defn seq-of-seqs [rows cols-per-row output-generator] (repeatedly rows (partial repeatedly cols-per-row output-generator))) ;; Of random numbers: (defn random-number-between-0-1000 [] (rand-int 1000)) (seq-of-seqs 10 4 random-number-between-0-1000) (defn seq-of-maps [rows cols-per-row output-generator] (let [header-data (map #(str “header-” %) (range cols-per-row)) row-data (seq-of-seqs rows cols-per-row output-generator)] (map #(zipmap header-data %) row-data))) (seq-of-maps 10 4 random-number-between-0-1000) ;; dtype next (library underneath tech.ml.dataset, which is underneath tablecloth) also ;; has a built-in sequence generator: (require’[tech.v3.datatype :as dtype]) (dtype/make-reader :string 4 (str “cell-” idx)) (dtype/make-reader :int32 4 (rand-int 10)) ;; It is lazy, not cached, so be careful 
about using a computationally-heavy fn for generator ;; ### Generating repeatable sequences of dummy data (def consistent-data (map-indexed (fn [index _coll] (str “cell-” index)) (range 10))) (repeat (zipmap (range 10) consistent-data)) :end
+;; This is a work in progress of the code examples that will make up chapter 2, section 1 ;; of the Clojure data cookbook ;; # 2.1 How to get data into the notebook ;; ## How to get data into the notebook ;; ### Reading from a delimited text file ;; Easiest with standard file formats, e.g. CSV. ;; #### With Clojure’s standard CSV library (require ‘[clojure.data.csv :as csv]’[clojure.java.io :as io]) ^{:nextjournal.clerk/viewer :table} (with-open [reader (io/reader “data/co2_over_time.csv”)] (doall (csv/read-csv reader))) ;; Returns: Lazy sequence of vectors of strings (one value per cell) ;; TODO: Link to useful explainer on lazy seqs, explain why we include doall
here ;; #### With tablecloth ;; For most work involving tabular/columnar data, you’ll use tablecloth, Clojure’s go-to data ;; wrangling library. These all return a tech.ml.dataset Dataset
object. The implementation ;; details aren’t important now, but tech.ml.dataset
is the library that allows for efficient ;; and fast operations on columnar datasets. ;; TODO: Be consistent about you vs we – pick on and stick with it (require ‘[tablecloth.api :as tc]) (require’[nextjournal.clerk :as clerk]) ;; (clerk/add-viewers! [{:pred #(= tech.v3.dataset.impl.dataset.Dataset (type %)) ;; ;; :fetch-fn (fn [_ file] {:nextjournal/content-type “image/png” ;; ;; :nextjournal/value (Files/readAllBytes (.toPath file))}) ;; :render-fn v/table}]) (-> “data/co2_over_time.csv” tc/dataset) ;; Note the built-in pretty printing. ;; TODO: Write elsewhere about kindly and notebooks, how they know how to render different things ;; Easy things to tidy up at import time: ;; ##### Transforming headers ;; We’ll require Clojure’s standard string library for this example. The transformation function is ;; arbitrary though, accepting a single header value and returning a single, transformed value. (require ‘[clojure.string :as str]) (defn- lower-case-keyword [val] (-> val (str/replace #“+” “-”) str/lower-case keyword)) (-> “data/co2_over_time.csv” (tc/dataset {:key-fn lower-case-keyword})) ;; ##### Specifying separators ;; Tablecloth is pretty smart about standard formats, e.g. CSV above and TSV: (-> “data/co2_over_time.tsv” tc/dataset) ;; But it can also accept an arbitrary separator if for some reason you have some data that uses ;; a non-standard file format (have a look at data/co2_over_time.txt
). Note the separator has to ;; be a single character. (-> “data/co2_over_time.txt” (tc/dataset {:separator “/”})) ;; ##### Specify file encoding ;; TODO: does this really matter? test out different file encodings.. ;; ##### Normalize values into consistent formats and types ;; Tablecloth makes it easy to apply arbitrary transformations to all values in a given column ;; We can inspect the column metadata with tablecloth: (def dataset (tc/dataset “data/co2_over_time.csv”)) (-> dataset (tc/info :columns)) ;; Certain types are built-in (it knows what to do to convert them, e.g. numbers:) ;; TODO: Explain why numbers get rounded? Probably not here.. in addendum about numbers in Clojure (-> dataset (tc/convert-types “CO2” :double) (tc/info :columns)) ;; The full list of magic symbols representing types tablecloth supports comes from the underlying ;; tech.ml.dataset
library: (require’[tech.v3.datatype.casting :as casting]) @casting/valid-datatype-set ;; More details on supported types here. ;; TODO: Explain when to use :double vs :type/numerical? What’s the difference? ;; You can also process multiple columns at once, either by specifying a map of columns to data types: (-> dataset (tc/convert-types {“CO2” :double “adjusted CO2” :double}) (tc/info :columns)) ;; Or by changing all columns of a certain type to another: (-> dataset (tc/convert-types :type/numerical :double) (tc/info :columns)) ;; The supported column types are: ;; :type/numerical - any numerical type ;; :type/float - floating point number (:float32 and :float64) ;; :type/integer - any integer ;; :type/datetime - any datetime type ;; Also the magical :!type
qualifier exists, which will select the complement set – all columns that ;; are not the specified type ;; For others you need to provide a casting function yourself, e.g. adding the UTC start of day, ;; accounting for local daylight savings (defn to-start-of-day-UTC [local-date] (-> local-date .atStartOfDay (java.time.ZonedDateTime/ofLocal (java.time.ZoneId/systemDefault) (java.time.ZoneOffset/UTC)))) (-> dataset (tc/convert-types “Date” [[:timezone-date to-start-of-day-UTC]]) (tc/info :columns)) ;; For full details on all the possible options for type conversion of columns see the ;; tablecloth API docs ;; ### Reading from a URL ;; CSV: (-> “https://vega.github.io/vega-lite/data/co2-concentration.csv” tc/dataset) ;; JSON: works as long as the data is an array of maps (-> “https://vega.github.io/vega-lite/data/cars.json” tc/dataset) ;; Tablecloth can handle a string that points to any file that contains either raw or gzipped csv/tsv, ;; json, xls(x), on the local file system or a URL. ;; ### Reading an excel file ;; Tablecloth supports reading xls and xlsx files iff the underlying Java library for working with ;; excel is included: (require ‘[tech.v3.libs.poi]) ;; This is not included in the library by default because poi
has a hard dependency on log4j2, along ;; with many other dependencies that the core team at tech.ml.dataset
(upon which tablecloth is built) ;; did not want to impose on all users by default (https://clojurians.zulipchat.com/#narrow/stream/236259-tech.2Eml.2Edataset.2Edev/topic/working.20with.20excel.20files/near/314711378). ;; You can still require it here, you’ll most likely just see an error that says something like ;; “Log4j2 could not find a logging implementation. Please add log4j-core to the classpath.”, unless ;; you already have a valid log4j config on your class path. ;; This should work according to maintainers, does not atm (tc/dataset “data/example_XLS.xls” {:filetype “xls”}) (tc/dataset “data/example_XLSX.xlsx” {:filetype “xlsx”}) (require’[dk.ative.docjure.spreadsheet :as xl]) (def xl-workbook (xl/load-workbook “data/example_XLS.xls”)) ;; To discover sheet names: (->> xl-workbook xl/sheet-seq (map xl/sheet-name)) ;; This will show us there is only one sheet in this workbook, named “Sheet1”. You can get the data ;; out of it like this: ;; To discover header names: (def headers (->> xl-workbook (xl/select-sheet “Sheet1”) xl/row-seq first xl/cell-seq (map xl/read-cell))) ;; To get the data out of the columns: (def column-index->header (zipmap [:A :B :C :D :E :F :G :H :I] headers)) (->> xl-workbook (xl/select-sheet “Sheet1”) (xl/select-columns column-index->header)) ;; and into a tablecloth dataset like this: (->> xl-workbook (xl/select-sheet “Sheet1”) (xl/select-columns column-index->header) (drop 1) ;; don’t count the header row as a row tc/dataset) ;; You might be tempted to just iterate over each row and read each cell, but it’s more ;; convenient to think of the data as column-based rather than row-based for tablecloth’s purposes. ;; Setting the dataset headers is more verbose when we’re starting from a seq of seqs, since ;; the header-row?
option does not work for a seq of seqs (this option is implemented in the ;; low-level parsing code for each supported input type and is not currently implemented for ;; a seq of seqs). (def iterated-xl-data (->> xl-workbook (xl/select-sheet “Sheet1”) xl/row-seq (map #(->> % xl/cell-seq (map xl/read-cell))))) ;; Note the header-row?
option is not supported: (tc/dataset iterated-xl-data {:header-row? true}) ;; Can do it manually, but just working with columns from the start is more idiomatic: (let [headers (first iterated-xl-data) rows (rest iterated-xl-data)] (map #(zipmap headers %) rows)) ;; ### Reading from a database ;; #### SQL database ;; (tc/dataset (,,, results from some SQL query)) ;; requires com.github.seancorfield/next.jdbc {:mvn/version "1.3.847"}
in deps.edn
;; Note you will also require the relevant driver for the type of db you are trying ;; to access. These are some available ones: (require ‘[next.jdbc :as jdbc]) ;; Connect to the db: (def db {:dbname “data/Chinook_Sqlite.sqlite” :dbtype “sqlite”}) (def ds (jdbc/get-datasource db)) ds ;; Pass the results of a sql query to tablecloth to make a (-> ds (jdbc/execute! [“SELECT * FROM artist”]) (tc/dataset)) ;; Passing a parameter to a query (-> ds (jdbc/execute! [“SELECT * FROM artist WHERE Name = ?” “Aerosmith”]) (tc/dataset)) ;; note for SQLite specifically the concat operator is ||
not +
(-> ds (jdbc/execute! [“SELECT * FROM artist WHERE Name like ‘%’ || ? || ‘%’” “man”]) (tc/dataset)) ;; #### SPARQL database (require’[grafter-2.rdf4j.repository :as repo]) (require ‘[grafter-2.rdf.protocols :as pr]) (def sparql (repo/sparql-repo “https://query.wikidata.org/sparql”)) ;; taken from: https://query.wikidata.org/#%23Public%20sculptures%20in%20Paris%0ASELECT%20DISTINCT%20%3Fitem%20%20%3FTitre%20%3Fcreateur%20%28year%28%3Fdate%29%20as%20%3FAnneeCreation%29%20%3Fimage%20%3Fcoord%0AWHERE%0A%7B%0A%20%20%20%3Fitem%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ860861.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20sculpture%0A%20%20%20%3Fitem%20wdt%3AP136%20wd%3AQ557141%20.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20genre%C2%A0%3A%20art%20public%0A%20%20%20%7B%3Fitem%20wdt%3AP131%20wd%3AQ90.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20…%20situ%C3%A9e%20dans%20Paris%0A%20%20%20UNION%0A%20%20%20%7B%3Fitem%20wdt%3AP131%20%3Farr.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20…%20ou%20dans%20un%20arrondissement%20de%20Paris%20%0A%20%20%20%3Farr%20wdt%3AP131%20wd%3AQ90.%20%7D%0A%20%20%20%3Fitem%20rdfs%3Alabel%20%3FTitre%20FILTER%20%28lang%28%3FTitre%29%20%3D%20%22fr%22%29.%20%20%23%20Titre%0A%20%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP170%20%3FQcreateur.%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20cr%C3%A9ateur%2Fcr%C3%A9atrice%20%28option%29%0A%20%20%20%3FQcreateur%20rdfs%3Alabel%20%3Fcreateur%20FILTER%20%28lang%28%3Fcreateur%29%20%3D%20%22fr%22%29%20.%7D%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP571%20%3Fdate.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20date%20de%20cr%C3%A9ation%20%28option%29%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP18%20%20%3Fimage.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20image%20%28option%2
9%0A%20%20%20OPTIONAL%20%7B%3Fitem%20wdt%3AP625%20%3Fcoord.%7D%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20coordonn%C3%A9es%20g%C3%A9ographiques%20%28option%29%0A%7D (def sparql-results (let [conn (repo/->connection sparql)] (-> conn (repo/query “# Public sculptures in Paris SELECT DISTINCT ?item ?title ?creator (year(?date) as ?year) ?coord WHERE { ?item wdt:P31/wdt:P279* wd:Q860861. # sculpture ?item wdt:P136 wd:Q557141 . # genre : art public {?item wdt:P131 wd:Q90.} # … située dans Paris UNION {?item wdt:P131 ?arr. # … ou dans un arrondissement de Paris ?arr wdt:P131 wd:Q90. } ?item rdfs:label ?title FILTER (lang(?title) = "fr"). # title OPTIONAL {?item wdt:P170 ?Qcreateur. # créateur/créatrice (option) ?Qcreateur rdfs:label ?creator FILTER (lang(?creator) = "fr") .} OPTIONAL {?item wdt:P571 ?date.} # date de création (option) OPTIONAL {?item wdt:P18 ?image.} # image (option) OPTIONAL {?item wdt:P625 ?coord.} # coordonnées géographiques (option) }”)))) ;; grafter db can help format RDF values (def sparql-ds (-> sparql-results tc/dataset (tc/update-columns [:coord :title :creator] (partial map pr/raw-value)))) ;; ### Generating sequences (defn seq-of-seqs [rows cols-per-row output-generator] (repeatedly rows (partial repeatedly cols-per-row output-generator))) ;; Of random numbers: (defn random-number-between-0-1000 [] (rand-int 1000)) (seq-of-seqs 10 4 random-number-between-0-1000) (defn seq-of-maps [rows cols-per-row output-generator] (let [header-data (map #(str “header-” %) (range cols-per-row)) row-data (seq-of-seqs rows cols-per-row output-generator)] (map #(zipmap header-data %) row-data))) (seq-of-maps 10 4 random-number-between-0-1000) ;; dtype next (library underneath tech.ml.dataset, which is underneath tablecloth) also ;; has a built-in sequence generator: (require’[tech.v3.datatype :as dtype]) (dtype/make-reader :string 4 (str “cell-” idx)) (dtype/make-reader :int32 4 (rand-int 10)) ;; It is lazy, not cached, so be careful 
about using a computationally-heavy fn for generator ;; ### Generating repeatable sequences of dummy data (def consistent-data (map-indexed (fn [index _coll] (str “cell-” index)) (range 10))) (repeat (zipmap (range 10) consistent-data)) :end
source: book/chapter_2_input_output/2_1_loading_data.clj
+source: book/chapter_2_input_output/2_1_loading_data.clj
6 6
+
+
+
ns chapter-2-input-output.2-2-messy-data
(:nextjournal.clerk/toc true}
{:require [tablecloth.api :as tc]
(:as fun]
[tech.v3.datatype.functional :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -273,12 +273,12 @@ 6
6.1 Multiple types mixed in one column
Tablecloth handles this just fine: it simply gives the column the type :object
-
+
def mixed-types
(:A ["string" "more strings" 3]
(tc/dataset {:B [1 2 "whoops"]}))
-
+
:columns) (tc/info mixed-types
_unnamed :column info [2 4]:
@@ -307,7 +307,7 @@
+
:A :string) (tc/convert-types mixed-types
_unnamed [3 2]:
@@ -333,7 +333,7 @@
+
-> mixed-types
(:A :string)
(tc/convert-types :columns)) (tc/info
@@ -368,18 +368,18 @@
6.2 Multiple formats for a thing that’s supposed to have one (e.g. phone numbers, postal codes)
You can pass an arbitrary function to update a column.
-
+
def misformatted
(:phone ["123-456-5654" "(304) 342 1235" "(423)-234-2342" "1234325984" "nope"]
(tc/dataset {:postal-code ["t1n 0k2" "H9Q1L2" "H3H 8V0" "eu5h04" "just wrong"]}))
-
+
require '[clojure.string :as str]) (
nil
-
+
def phone-regex
(re-pattern
(str
@@ -391,7 +391,7 @@ ("(\\d{4})" ; any 4 numbers
)))
-
+
defn- normalize-phone-numbers [col]
(map (fn [v]
(let [[match a b c] (re-matches phone-regex v)]
@@ -403,7 +403,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-phone-numbers
-
+
def postal-code-regex
(re-pattern
(str
@@ -419,7 +419,7 @@ (".*"
"(\\d{1})")))
-
+
defn- normalize-postal-codes [col]
(map (fn [v]
(let [[match a b c d e f] (->> v str/upper-case (re-matches postal-code-regex))]
@@ -431,7 +431,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-postal-codes
-
+
-> misformatted
(:phone normalize-phone-numbers
(tc/update-columns {:postal-code normalize-postal-codes}))
@@ -471,19 +471,19 @@
6.3 Missing values
Tablecloth has many built-in helpers for dealing with missing values.
-
+
require '[tech.v3.datatype.datetime :as dt]) (
nil
-
+
def sparse
(:A [1 2 3 nil nil 6]
(tc/dataset {:B ["test" nil "this" "is" "a" "test"]}))
Drop whole rows with any missing values:
-
+
(tc/drop-missing sparse)
_unnamed [3 2]:
@@ -510,7 +510,7 @@
Drop whole rows with any missing values in a given column:
-
+
:A) (tc/drop-missing sparse
_unnamed [4 2]:
@@ -544,12 +544,12 @@
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
-source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
diff --git a/chapter_2_input_output/2_3_exporting_data/index.html b/chapter_2_input_output/2_3_exporting_data/index.html
index abc07b2..a1e1d73 100644
--- a/chapter_2_input_output/2_3_exporting_data/index.html
+++ b/chapter_2_input_output/2_3_exporting_data/index.html
@@ -2,7 +2,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -231,8 +231,7 @@
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
How to do this depends on what the data looks like. For a seq of maps: headers are not necessarily sorted, so put them in whatever order you want here. Clojure maps make no guarantees about key order, so make sure to order the values as well, i.e. use the same header row to get the values from each map.
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!). To do that, call:
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year. Tablecloth applies the aggregate fn to every group's dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth dataset using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite spec. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace adds Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
ns chapter-2-input-output.2-2-messy-data
(:nextjournal.clerk/toc true}
{:require [tablecloth.api :as tc]
(:as fun]
[tech.v3.datatype.functional :as kind-clerk])) [scicloj.kind-clerk.api
(kind-clerk/setup!)
6
6.1 Multiple types mixed in one column
Tablecloth will handle it just fine, it will just give the column the type :object
-
+
def mixed-types
(:A ["string" "more strings" 3]
(tc/dataset {:B [1 2 "whoops"]}))
-
+
:columns) (tc/info mixed-types
_unnamed :column info [2 4]:
@@ -307,7 +307,7 @@
+
:A :string) (tc/convert-types mixed-types
_unnamed [3 2]:
@@ -333,7 +333,7 @@
+
-> mixed-types
(:A :string)
(tc/convert-types :columns)) (tc/info
@@ -368,18 +368,18 @@
6.2 Multiple formats for a thing that’s supposed to have one (e.g. phone numbers, postal codes)
You can pass any arbitrary function to update a column
-
+
def misformatted
(:phone ["123-456-5654" "(304) 342 1235" "(423)-234-2342" "1234325984" "nope"]
(tc/dataset {:postal-code ["t1n 0k2" "H9Q1L2" "H3H 8V0" "eu5h04" "just wrong"]}))
-
+
require '[clojure.string :as str]) (
nil
-
+
def phone-regex
(re-pattern
(str
@@ -391,7 +391,7 @@ ("(\\d{4})" ; any 4 numbers
)))
-
+
defn- normalize-phone-numbers [col]
(map (fn [v]
(let [[match a b c] (re-matches phone-regex v)]
@@ -403,7 +403,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-phone-numbers
-
+
def postal-code-regex
(re-pattern
(str
@@ -419,7 +419,7 @@ (".*"
"(\\d{1})")))
-
+
defn- normalize-postal-codes [col]
(map (fn [v]
(let [[match a b c d e f] (->> v str/upper-case (re-matches postal-code-regex))]
@@ -431,7 +431,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-postal-codes
-
+
-> misformatted
(:phone normalize-phone-numbers
(tc/update-columns {:postal-code normalize-postal-codes}))
@@ -471,19 +471,19 @@
6.3 Missing values
Tablecloth has many built-in helpers for dealing with missing values.
-
+
require '[tech.v3.datatype.datetime :as dt]) (
nil
-
+
def sparse
(:A [1 2 3 nil nil 6]
(tc/dataset {:B ["test" nil "this" "is" "a" "test"]}))
Drop whole rows with any missing values:
-
+
(tc/drop-missing sparse)
_unnamed [3 2]:
@@ -510,7 +510,7 @@
Drop whole row with any missing values in a given column:
-
+
:A) (tc/drop-missing sparse
_unnamed [4 2]:
@@ -544,12 +544,12 @@
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
-source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
diff --git a/chapter_2_input_output/2_3_exporting_data/index.html b/chapter_2_input_output/2_3_exporting_data/index.html
index abc07b2..a1e1d73 100644
--- a/chapter_2_input_output/2_3_exporting_data/index.html
+++ b/chapter_2_input_output/2_3_exporting_data/index.html
@@ -2,7 +2,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -231,8 +231,7 @@
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
:object
def mixed-types
(:A ["string" "more strings" 3]
(tc/dataset {:B [1 2 "whoops"]}))
:columns) (tc/info mixed-types
_unnamed :column info [2 4]:
@@ -307,7 +307,7 @@
+
:A :string) (tc/convert-types mixed-types
:A :string) (tc/convert-types mixed-types
_unnamed [3 2]:
@@ -333,7 +333,7 @@
+
-> mixed-types
(:A :string)
(tc/convert-types :columns)) (tc/info
@@ -368,18 +368,18 @@
6.2 Multiple formats for a thing that’s supposed to have one (e.g. phone numbers, postal codes)
You can pass any arbitrary function to update a column
-
+
def misformatted
(:phone ["123-456-5654" "(304) 342 1235" "(423)-234-2342" "1234325984" "nope"]
(tc/dataset {:postal-code ["t1n 0k2" "H9Q1L2" "H3H 8V0" "eu5h04" "just wrong"]}))
-
+
require '[clojure.string :as str]) (
nil
-
+
def phone-regex
(re-pattern
(str
@@ -391,7 +391,7 @@ ("(\\d{4})" ; any 4 numbers
)))
-
+
defn- normalize-phone-numbers [col]
(map (fn [v]
(let [[match a b c] (re-matches phone-regex v)]
@@ -403,7 +403,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-phone-numbers
-
+
def postal-code-regex
(re-pattern
(str
@@ -419,7 +419,7 @@ (".*"
"(\\d{1})")))
-
+
defn- normalize-postal-codes [col]
(map (fn [v]
(let [[match a b c d e f] (->> v str/upper-case (re-matches postal-code-regex))]
@@ -431,7 +431,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-postal-codes
-
+
-> misformatted
(:phone normalize-phone-numbers
(tc/update-columns {:postal-code normalize-postal-codes}))
@@ -471,19 +471,19 @@
6.3 Missing values
Tablecloth has many built-in helpers for dealing with missing values.
-
+
require '[tech.v3.datatype.datetime :as dt]) (
nil
-
+
def sparse
(:A [1 2 3 nil nil 6]
(tc/dataset {:B ["test" nil "this" "is" "a" "test"]}))
Drop whole rows with any missing values:
-
+
(tc/drop-missing sparse)
_unnamed [3 2]:
@@ -510,7 +510,7 @@
Drop whole row with any missing values in a given column:
-
+
:A) (tc/drop-missing sparse
_unnamed [4 2]:
@@ -544,12 +544,12 @@
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
-source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
diff --git a/chapter_2_input_output/2_3_exporting_data/index.html b/chapter_2_input_output/2_3_exporting_data/index.html
index abc07b2..a1e1d73 100644
--- a/chapter_2_input_output/2_3_exporting_data/index.html
+++ b/chapter_2_input_output/2_3_exporting_data/index.html
@@ -2,7 +2,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -231,8 +231,7 @@
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
-> mixed-types
(:A :string)
(tc/convert-types :columns)) (tc/info
6.2 Multiple formats for a thing that’s supposed to have one (e.g. phone numbers, postal codes)
You can pass any arbitrary function to update a column
-def misformatted
(:phone ["123-456-5654" "(304) 342 1235" "(423)-234-2342" "1234325984" "nope"]
(tc/dataset {:postal-code ["t1n 0k2" "H9Q1L2" "H3H 8V0" "eu5h04" "just wrong"]}))
require '[clojure.string :as str]) (
nil
def phone-regex
(re-pattern
(str
@@ -391,7 +391,7 @@ ("(\\d{4})" ; any 4 numbers
)))
defn- normalize-phone-numbers [col]
(map (fn [v]
(let [[match a b c] (re-matches phone-regex v)]
@@ -403,7 +403,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-phone-numbers
def postal-code-regex
(re-pattern
(str
@@ -419,7 +419,7 @@ (".*"
"(\\d{1})")))
defn- normalize-postal-codes [col]
(map (fn [v]
(let [[match a b c d e f] (->> v str/upper-case (re-matches postal-code-regex))]
@@ -431,7 +431,7 @@ (
#'chapter-2-input-output.2-2-messy-data/normalize-postal-codes
-> misformatted
(:phone normalize-phone-numbers
(tc/update-columns {:postal-code normalize-postal-codes}))
6.3 Missing values
Tablecloth has many built-in helpers for dealing with missing values.
-require '[tech.v3.datatype.datetime :as dt]) (
nil
def sparse
(:A [1 2 3 nil nil 6]
(tc/dataset {:B ["test" nil "this" "is" "a" "test"]}))
Drop whole rows with any missing values:
- (tc/drop-missing sparse)
_unnamed [3 2]:
@@ -510,7 +510,7 @@
Drop whole row with any missing values in a given column:
-
+
:A) (tc/drop-missing sparse
_unnamed [4 2]:
@@ -544,12 +544,12 @@
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
-source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
diff --git a/chapter_2_input_output/2_3_exporting_data/index.html b/chapter_2_input_output/2_3_exporting_data/index.html
index abc07b2..a1e1d73 100644
--- a/chapter_2_input_output/2_3_exporting_data/index.html
+++ b/chapter_2_input_output/2_3_exporting_data/index.html
@@ -2,7 +2,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -231,8 +231,7 @@
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
:A) (tc/drop-missing sparse
_unnamed [4 2]:
@@ -544,12 +544,12 @@
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
-source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
diff --git a/chapter_2_input_output/2_3_exporting_data/index.html b/chapter_2_input_output/2_3_exporting_data/index.html
index abc07b2..a1e1d73 100644
--- a/chapter_2_input_output/2_3_exporting_data/index.html
+++ b/chapter_2_input_output/2_3_exporting_data/index.html
@@ -2,7 +2,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -231,8 +231,7 @@
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
6.4 Arbitrary values meant to indicate missing (e.g. “NONE”, “N/A”, false, etc.)
-It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”,false
, etc.
+It’s not uncommon to see missing values indicated in multiple different ways, sometimes even within the same dataset. E.g. missing cells might be blank entirely, or they might be populated with some arbitrary value meant to indicate “nothing”, like “NONE”, “N/A”, false
, etc.
source: book/chapter_2_input_output/2_2_messy_data.clj
+source: book/chapter_2_input_output/2_2_messy_data.clj
7 7
+
+
+
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
-
+
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-
+
let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-
+
def tc-dataset (tc/dataset data)) (
-
+
"data/tc-output.csv") (tc/write-csv! tc-dataset
@@ -307,14 +307,14 @@
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
ns chapter-2-input-output.2-3-exporting-data
(:nextjournal.clerk/toc true}
{:require
@@ -266,24 +266,24 @@ (7 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
(kind-clerk/setup!)
:ok
def consistent-data
(fn [index _coll] (str "cell-" index))
(map-indexed (range 10))) (
def data (take 20 (repeat (zipmap (range 10) consistent-data)))) (
7.1 Writing to a CSV file
depends what the data looks like for a seq of maps: headers are not necessarily sorted, put them in whatever order you want here Clojure maps make no guarantees about key order, make sure to order values, i.e. use the same header row to get the values from each map
-let [headers (-> data first keys sort)
(->> data (map (fn [row]
rows (map (fn [header]
@@ -295,10 +295,10 @@ (nil
Tablecloth can also export csvs (among other formats)
-def tc-dataset (tc/dataset data)) (
"data/tc-output.csv") (tc/write-csv! tc-dataset
7.2 Writing nippy
-
+
"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-
+
"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
7.2 Writing nippy
-"data/tc-nippy.nippy") (tc/write! tc-dataset
nil
Read this also with tablecloth:
-"data/tc-nippy.nippy") (tc/dataset
data/tc-nippy.nippy [20 10]:
@@ -591,14 +591,14 @@
7.3 Leave data in Clojure files
-
+
->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-
+
with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
@@ -808,17 +808,17 @@
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-
+
comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
-More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
+More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
-source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
diff --git a/chapter_3_data_manipulation/3_data_manipulation/index.html b/chapter_3_data_manipulation/3_data_manipulation/index.html
index fb89a1f..a10532b 100644
--- a/chapter_3_data_manipulation/3_data_manipulation/index.html
+++ b/chapter_3_data_manipulation/3_data_manipulation/index.html
@@ -2,7 +2,7 @@
-
+
@@ -64,7 +64,7 @@
-
+
@@ -183,14 +183,14 @@
@@ -204,7 +204,7 @@
Table of contents
- 8.1 Sorting
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions
@@ -236,8 +236,7 @@ 8 8
+
+
+
ns chapter-3-data-manipulation.3-data-manipulation
(;; {:nextjournal.clerk/visibility {:code :hide}
;; :nextjournal.clerk/toc true}
@@ -272,7 +272,7 @@ 8 :as stats]
[fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
@@ -282,7 +282,7 @@ 8
8.1 Sorting
-
+
def dataset (tc/dataset [{:country "Canada"
(:size 10000000}
:country "USA"
@@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-
+
-> dataset
(:country :size])) (tc/reorder-columns [
@@ -323,7 +323,7 @@
8.1.2 Sorting rows
-
+
-> dataset
(:size] [:desc])) (tc/order-by [
@@ -354,7 +354,7 @@
8.1.3 Custom sorting functions
e.g. length of the country name
-
+
-> dataset
(fn [row] (-> row :country count))
(tc/order-by (:desc))
@@ -386,7 +386,7 @@
8.2 Selecting one column or multiple columns
-
+
-> dataset
(:country])) (tc/select-columns [
@@ -412,8 +412,9 @@
8.3 Randomizing order
-
--> dataset tc/shuffle) (
+
+-> dataset
+ ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
8.4 Repeatable randomisation
-
--> dataset (tc/shuffle {:seed 100})) (
+
+-> dataset
+ (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-
+
def dupes (tc/dataset [{:country "Canada"
(:size 10000000}
:country "Canada"
@@ -481,8 +483,9 @@ {:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
-
--> dupes tc/unique-by) (
+
+-> dupes
+ ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
-
--> dupes (tc/unique-by :size)) (
+
+-> dupes
+ (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
-
--> dupes (tc/unique-by :country)) (
+
+-> dupes
+ (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+
+-> dupes
+ (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
-
--> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]
- (case (tdsc/column-name vals)
- (:size (apply max vals)
- :country (last vals)))}))
+
+-> dupes
+ (-> % :country str/lower-case)
+ (tc/unique-by #(:strategy (fn [vals]
+ {case (tdsc/column-name vals)
+ (:size (apply max vals)
+ :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-
+
-> dataset
(:area [9000000 8000000 1000000])) (tc/add-column
@@ -662,7 +670,7 @@
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -684,25 +692,25 @@ (tc/rename-columns {
Canada
10000000
-4.0E+07
+4.0e07
4.00000000
USA
9000000
-1.0E+08
+1.0e08
11.11111111
Germany
80000
-8.0E+07
+8.0e07
1000.00000000
vs, probably preferable
-
+
-> dataset
(:population [40000000 100000000 80000000])
(tc/add-column :size :area})
@@ -743,7 +751,7 @@ (tc/rename-columns {
- Removing columns
-
+
-> dataset
(:size)) (tc/drop-columns
@@ -776,7 +784,7 @@ Filtering rows
- Single filter, multiple filters
-
+
-> dataset
(fn [row]
(tc/select-rows (< 1000000 (:size row))))) (
@@ -803,10 +811,10 @@
- Aggregating rows (counts, groups)
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:average-co2 (fn [ds]
(tc/aggregate {/ (reduce + (get ds "CO2"))
@@ -826,7 +834,7 @@ (
Add a column for year
-
+
-> co2-over-time
("Year" "Date" (memfn getYear))) (tc/map-columns
@@ -976,7 +984,7 @@
Group by year
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))) (.getYear (
@@ -1104,14 +1112,14 @@
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-
+
defn round2
("Round a double to the given precision (number of significant digits)"
[precision d]let [factor (Math/pow 10 precision)]
(/ (Math/round (* d factor)) factor))) (
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-
+
-> co2-over-time
(fn [row]
(tc/group-by (get row "Date"))))
@@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-
+
def ds1 (tc/dataset [{:id "id1" :b "val1"}
(:id "id2" :b "val2"}
{:id "id3" :b "val3"}])) {
-
+
def ds2 (tc/dataset [{:id "id1" :b "val4"}
(:id "id5" :b "val5"}
{:id "id6" :b "val6"}])) {
Naively concats rows
-
+
:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@
-
+
:b "val4" :c "text"}
(tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"}
{:b "val6" :c "test"}])) {
@@ -1430,7 +1438,7 @@
De-duping
-
+
(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@ Merging datasets
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins
-
+
def ds3 (tc/dataset {:id [1 2 3 4]
(:b ["val1" "val2" "val3" "val4"]}))
-
+
def ds4 (tc/dataset {:id [1 2 3 4]
(:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-
+
:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@
“Merge” datasets on a given column where rows have a value
-
+
:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@
Drop rows missing a value
-
+
:id [1 2 3 4]
(tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1597,7 +1605,7 @@ (tc/dataset {
-
+
:id [1 2 3 ]
(tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]})
:id [1 2 3 4]
@@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-
+
:email ["asdf"]
(tc/left-join (tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1698,7 +1706,7 @@
-
+
:email ["asdf"]
(tc/dataset {:name ["asdfads"]
:entry-id [1 2 3]})
@@ -1730,7 +1738,7 @@
-
+
:entry-id [1 2 3]
(tc/dataset {:upload-count [2 3 4]
:catgory ["art" "science"]})
@@ -1763,7 +1771,7 @@
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-
+
:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@ Converting between wide and long formats? Signal processing/time series analysis
- Compute rolling average to be able to plot a trend line
-
+
def exp-moving-avg
(let [data (get co2-over-time "adjusted CO2")
(
@@ -1801,7 +1809,7 @@ moving-avg
- widen dataset to include new row that’s already in order
-
+
(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@
- Rolling average over a 12 point range
-
+
def rolling-average
("Rolling average"
(tc/dataset [[-> co2-over-time
@@ -1961,7 +1969,7 @@ (:relative-window-position :left}))]])) {
fun/mean
-
+
(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@
- Train a model to predict the next 10 years
-
+
-> co2-over-time
( )
@@ -2242,7 +2250,7 @@ Summarizing data (mean, standard deviation, confidence intervals etc.)
- Standard deviation using fastmath
-
+
def avg-co2-by-year
(-> co2-over-time
(fn [row]
@@ -2260,7 +2268,7 @@ (tc/group-by (
- Overall average
-
+
:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@
- Long term average 1991-2020
-
+
-> avg-co2-by-year
(;; (tc/select-rows (fn [row] (< 1990 (:year row))))
;; :average-co2
@@ -2406,12 +2414,12 @@ Run length encoding?
- Filling
nil
s with last non-nil
value?
-
+
def sparse-dataset
(:a [nil 2 3 4 nil nil 7 8]
(tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
-
+
-> sparse-dataset
(:up)) (tc/replace-missing
@@ -2458,7 +2466,7 @@
-
+
-> sparse-dataset
(:updown)) (tc/replace-missing
@@ -2505,7 +2513,7 @@
-
+
-> sparse-dataset
(:down)) (tc/replace-missing
@@ -2552,7 +2560,7 @@
-
+
-> sparse-dataset
(:downup)) (tc/replace-missing
@@ -2599,7 +2607,7 @@
-
+
-> sparse-dataset
(:lerp)) (tc/replace-missing
@@ -2646,7 +2654,7 @@
-
+
-> sparse-dataset
(:all :value 100)) (tc/replace-missing
@@ -2693,7 +2701,7 @@
-
+
-> sparse-dataset
(:a :value 100)) (tc/replace-missing
@@ -2744,7 +2752,7 @@
-source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
@@ -2991,8 +2999,8 @@
diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html
index 07805f3..06e91fd 100644
--- a/chapter_4_data_visualisation/4_2_graphs/index.html
+++ b/chapter_4_data_visualisation/4_2_graphs/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 9 Graphs
+Clojure Data Cookbook - 10 Graphs
-
+
+
-
+
ns chapter-4-data-visualisation.4-2-graphs
(:require [tablecloth.api :as tc]
(:as hc]
@@ -265,16 +264,16 @@ [aerial.hanami.common 9 :as tc]
[tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
+
def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
-
+
-> co2-over-time
(:X "Date"
(vis/hanami-plot ht/line-chart {:XTYPE "temporal"
@@ -283,15 +282,12 @@ 9 :YSCALE {:zero false}}))
-
-vega
-
-
+
def diamonds datasets/diamonds) (
-
+
-> diamonds
(:X :cut
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -299,13 +295,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :color
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -313,13 +306,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
-> diamonds
(:X :clarity
(vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal"
@@ -327,13 +317,10 @@ 9 :WIDTH 750}))
-
-vega
-
-
+
:ok
@@ -343,7 +330,7 @@ 9 book/chapter_4_data_visualisation/4_2_graphs.clj
+source: book/chapter_4_data_visualisation/4_2_graphs.clj
@@ -584,14 +571,11 @@ 9
diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html
index 976c4d0..692688a 100644
--- a/chapter_4_data_visualisation/noj_examples/index.html
+++ b/chapter_4_data_visualisation/noj_examples/index.html
@@ -2,12 +2,12 @@
-
+
-Clojure Data Cookbook - 10 Graphs with Noj
+Clojure Data Cookbook - 9 Graphs with Noj
-
+
+
-
-10.1 Bar graphs
-
+
+9.1 Bar graphs
+
ns chapter-4-data-visualisation.noj-examples
(:require [tablecloth.api :as tc]
(:as hc]
@@ -283,45 +284,37 @@ [aerial.hanami.common :as color]
[clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
-
+
(kind-clerk/setup!)
:ok
-
-10.2 Raw html
-
+
+9.2 Raw html
+
-> "<p>Hello, <i>Noj</i>.</p>"
- ( vis/raw-html)
-
-
-
-
-
-
-
--> [:svg {:height 210
- (:width 500}
- :line {:x1 0
- [:y1 0
- :x2 200
- :y2 200
- :style "stroke:rgb(255,0,0);stroke-width:2"}]]
-
- hiccup/html vis/raw-html)
-
-
-
-
-
-
+ kind/html)
+
+
+Hello, Noj.
+
+
+
+ (kind/html"
+ <svg height=100 width=100>
+<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />
+</svg> ")
+
+
-
-10.3 Visualizing datases with Hanami
+
+9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-
+
def random-walk
(let [n 20]
(-> {:x (range n)
@@ -329,22 +322,19 @@ (+))}
tc/dataset)))
(reductions
-
-10.3.1 A simple plot
+
+9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})) {
-
-vega
-
-
+
Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-
+
-> random-walk
(
(vis/hanami-plot ht/point-chart:MSIZE 200})
@@ -360,14 +350,14 @@ {:height 300,
:data
:values
- {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n",
+ "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n",
:format {:type "csv"}}}
-
-10.3.2 Additional Hanami templates
+
+9.3.2 Additional Hanami templates
The scicloj.noj.v1.vis.hanami.templates
namespace add Hanami templates to Hanami’s own collection.
-
+
-> datasets/mtcars
(
(vis/hanami-plot vht/boxplot-chart:X :gear
@@ -375,15 +365,12 @@ {:Y :mpg}))
-
-vega
-
-
-10.3.3 Layers
-
+
+9.3.3 Layers
+
-> random-walk
(
(vis/hanami-layers:TITLE "points and a line"}
@@ -396,15 +383,12 @@ {:MCOLOR "brown"})]))
-
-vega
-
-
+
-
-10.3.4 Concatenation
-
+
+9.3.4 Concatenation
+
-> random-walk
(
(vis/hanami-vconcat
@@ -421,12 +405,9 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
+
-> random-walk
(
(vis/hanami-hconcat
@@ -443,15 +424,12 @@ {}:WIDTH 100})]))
-
-vega
-
-
+
-
-10.3.5 Linear regression
-
+
+9.3.5 Linear regression
+
-> datasets/mtcars
(:mpg [:wt]
(stats/add-predictions :model-type :smile.regression/ordinary-least-square})
@@ -472,30 +450,24 @@ {:YTITLE :mpg})]))
-
-vega
-
-
+
-
-10.3.6 Histogram
-
+
+9.3.6 Histogram
+
-> datasets/iris
(:sepal-width
(vis/hanami-histogram :nbins 10})) {
-
-vega
-
-
-10.3.7 Combining a few things together
+
+9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -528,13 +500,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
-
+
A similar example with histograms:
-
+
let [pallete (->> :accent
(
color/palettemapv color/format-hex))]
@@ -549,13 +518,10 @@ (nil {}))))
(vis/hanami-vconcat
-
-vega
-
Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-
+
-> datasets/mtcars
(:gear])
(tc/group-by [:mpg [:wt]
@@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
-
-vega
-
-
+
-
+
:bye
@@ -600,7 +563,7 @@ book/chapter_4_data_visualisation/noj_examples.clj
+source: book/chapter_4_data_visualisation/noj_examples.clj
@@ -843,11 +806,14 @@
diff --git a/index.html b/index.html
index a1c59f2..529baf1 100644
--- a/index.html
+++ b/index.html
@@ -2,7 +2,7 @@
-
+
@@ -182,14 +182,14 @@
@@ -203,7 +203,7 @@ Table of contents
- 1 Preface
-
@@ -231,8 +231,7 @@ Clojure Data Cookbook
-
-
-
+
+
+
ns index
(:nextjournal.clerk/visibility {:code :hide}}
{:require
@@ -268,8 +268,6 @@ (1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-
Contents
@@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples
-
+
+
+dev
+
+-
+Dev
+
+
1.2 Recommended sections
-randomizing order
+
-source: book/index.clj
+source: book/index.clj
diff --git a/search.json b/search.json
index c39f026..3826df4 100644
--- a/search.json
+++ b/search.json
@@ -11,7 +11,7 @@
"href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d",
"title": "Clojure Data Cookbook",
"section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D",
- "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples"
+ "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev"
},
{
"objectID": "index.html#recommended-sections",
@@ -200,41 +200,41 @@
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order",
"title": "8 Data manipulation",
"section": "8.3 Randomizing order",
- "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
+ "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000"
},
{
"objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation",
"title": "8 Data manipulation",
"section": "8.4 Repeatable randomisation",
- "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
- },
- {
- "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
- "title": "9 Graphs",
- "section": "",
- "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
+ "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
"href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs",
- "title": "10 Graphs with Noj",
- "section": "10.1 Bar graphs",
- "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
+ "title": "9 Graphs with Noj",
+ "section": "9.1 Bar graphs",
+ "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
"href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html",
- "title": "10 Graphs with Noj",
- "section": "10.2 Raw html",
- "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)"
+ "title": "9 Graphs with Noj",
+ "section": "9.2 Raw html",
+ "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")"
},
{
"objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
"href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami",
- "title": "10 Graphs with Noj",
- "section": "10.3 Visualizing datases with Hanami",
- "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n 
ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ "title": "9 Graphs with Noj",
+ "section": "9.3 Visualizing datases with Hanami",
+ "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n 
:HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. 
We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj"
+ },
+ {
+ "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "href": "chapter_4_data_visualisation/4_2_graphs/index.html",
+ "title": "10 Graphs",
+ "section": "",
+ "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj"
}
]
\ No newline at end of file
7.3 Leave data in Clojure files
-->> data pr-str (spit "data/clojure-output.edn")) (
nil
This can be consumed later with:
-with-open [reader (io/reader "data/clojure-output.edn")]
( (edn/read (java.io.PushbackReader. reader)))
7.4 Notebook artifacts
Clerk supports publishing your namespaces as HTML (like this website!) To do that call
-comment
(:paths "path/to/files..."
(clerk/build! {:index "book/index.clj"}))
More information in Clerk’s docs: https://book.clerk.vision/#static-building HTML pages Other formats, options for exporting notebooks? PDFs? Partial artifacts, e.g. export just a graph Writing to a database?
source: book/chapter_2_input_output/2_3_exporting_data.clj
+source: book/chapter_2_input_output/2_3_exporting_data.clj
Table of contents
- 8.1 Sorting
-
-
+
- 8.1.1 Sorting columns
- 8.1.2 Sorting rows
- 8.1.3 Custom sorting functions @@ -236,8 +236,7 @@
- Removing columns -
- Single filter, multiple filters -
- Aggregating rows (counts, groups) -
- When column headers are the same or different, on multiple columns TODO explain set logic and SQL joins -
- Compute rolling average to be able to plot a trend line -
- widen dataset to include new row that’s already in order -
- Rolling average over a 12 point range -
- Train a model to predict the next 10 years -
- Standard deviation using fastmath -
- Overall average -
- Long term average 1991-2020 -
- Filling
nil
s with last non-nil
value?
- - 1 Preface - @@ -231,8 +231,7 @@
- +Dev + +
8 8 + + +
ns chapter-3-data-manipulation.3-data-manipulation (;; {:nextjournal.clerk/visibility {:code :hide} ;; :nextjournal.clerk/toc true} @@ -272,7 +272,7 @@
8 :as stats] [fastmath.stats :as kind-clerk])) [scicloj.kind-clerk.api
+(kind-clerk/setup!)
@@ -282,7 +282,7 @@8
8.1 Sorting
-+def dataset (tc/dataset [{:country "Canada" (:size 10000000} :country "USA" @@ -293,7 +293,7 @@ {
8.1.1 Sorting columns
Give the column headers in the order you want
-+-> dataset (:country :size])) (tc/reorder-columns [
8.1.2 Sorting rows
-+-> dataset (:size] [:desc])) (tc/order-by [
8.1.3 Custom sorting functions
e.g. length of the country name
-+@@ -386,7 +386,7 @@-> dataset (fn [row] (-> row :country count)) (tc/order-by (:desc))
8.2 Selecting one column or multiple columns
-+-> dataset (:country])) (tc/select-columns [
8.3 Randomizing order
--+-> dataset tc/shuffle) (
+-> dataset + ( tc/shuffle)
_unnamed [3 2]:
@@ -441,8 +442,9 @@
-8.4 Repeatable randomisation
--+-> dataset (tc/shuffle {:seed 100})) (
+-> dataset + (:seed 100})) (tc/shuffle {
_unnamed [3 2]:
@@ -468,7 +470,7 @@
Finding unique rows
-+def dupes (tc/dataset [{:country "Canada" (:size 10000000} :country "Canada" @@ -481,8 +483,9 @@ {
:size 80000}]))
(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique
--+-> dupes tc/unique-by) (
+-> dupes + ( tc/unique-by)
_unnamed [5 2]:
@@ -515,8 +518,9 @@
--+-> dupes (tc/unique-by :size)) (
+-> dupes + (:size)) (tc/unique-by
_unnamed [4 2]:
@@ -545,8 +549,9 @@
--+-> dupes (tc/unique-by :country)) (
+-> dupes + (:country)) (tc/unique-by
_unnamed [4 2]:
@@ -575,8 +580,9 @@
--+-> dupes (tc/unique-by #(-> % :country str/lower-case))) (
+-> dupes + (-> % :country str/lower-case))) (tc/unique-by #(
_unnamed [3 2]:
@@ -601,11 +607,13 @@
--+-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals] - (case (tdsc/column-name vals) - (:size (apply max vals) - :country (last vals)))}))
+-> dupes + (-> % :country str/lower-case) + (tc/unique-by #(:strategy (fn [vals] + {case (tdsc/column-name vals) + (:size (apply max vals) + :country (last vals)))}))
_unnamed [3 2]:
@@ -631,7 +639,7 @@
could use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one
-+-> dataset (:area [9000000 8000000 1000000])) (tc/add-column
+-> dataset (:population [40000000 100000000 80000000]) (tc/add-column :size :area}) @@ -684,25 +692,25 @@ (tc/rename-columns {
Canada 10000000 -4.0E+07 +4.0e07 4.00000000 USA 9000000 -1.0E+08 +1.0e08 11.11111111 Germany 80000 -8.0E+07 +8.0e07 1000.00000000 vs, probably preferable
-+-> dataset (:population [40000000 100000000 80000000]) (tc/add-column :size :area}) @@ -743,7 +751,7 @@ (tc/rename-columns {
+-> dataset (:size)) (tc/drop-columns
Filtering rows
+@@ -803,10 +811,10 @@-> dataset (fn [row] (tc/select-rows (< 1000000 (:size row))))) (
+def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
+-> co2-over-time (:average-co2 (fn [ds] (tc/aggregate {/ (reduce + (get ds "CO2")) @@ -826,7 +834,7 @@ (
Add a column for year
-+-> co2-over-time ("Year" "Date" (memfn getYear))) (tc/map-columns
Group by year
-+@@ -1104,14 +1112,14 @@-> co2-over-time (fn [row] (tc/group-by (get row "Date"))))) (.getYear (
Get average temp per year tablecloth applies the aggregate fn to every groups dataset
-+defn round2 ("Round a double to the given precision (number of significant digits)" [precision d]let [factor (Math/pow 10 precision)] (/ (Math/round (* d factor)) factor))) (
+-> co2-over-time (fn [row] (tc/group-by (get row "Date")))) @@ -1220,7 +1228,7 @@ (.getYear (
Can rename the column to be more descriptive
-+-> co2-over-time (fn [row] (tc/group-by (get row "Date")))) @@ -1329,18 +1337,18 @@ (.getYear (
Concatenating datasets
-+def ds1 (tc/dataset [{:id "id1" :b "val1"} (:id "id2" :b "val2"} {:id "id3" :b "val3"}])) {
+def ds2 (tc/dataset [{:id "id1" :b "val4"} (:id "id5" :b "val5"} {:id "id6" :b "val6"}])) {
Naively concats rows
-+:id "id3" :b "other value"}])) (tc/concat ds1 ds2 (tc/dataset [{
_unnamed [7 2]:
@@ -1382,7 +1390,7 @@-
+@@ -1430,7 +1438,7 @@:b "val4" :c "text"} (tc/concat ds1 (tc/dataset [{:b "val5" :c "hi"} {:b "val6" :c "test"}])) {
De-duping
-+(tc/union ds1 ds2)
union [6 2]:
@@ -1472,16 +1480,16 @@Merging datasets
+def ds3 (tc/dataset {:id [1 2 3 4] (:b ["val1" "val2" "val3" "val4"]}))
+def ds4 (tc/dataset {:id [1 2 3 4] (:c ["val1" "val2" "val3" "val4"]}))
Keep all columns
-+:id) (tc/full-join ds3 ds4
full-join [4 4]:
@@ -1522,7 +1530,7 @@“Merge” datasets on a given column where rows have a value
-+:id) (tc/inner-join ds3 ds4
inner-join [4 3]:
@@ -1558,7 +1566,7 @@Drop rows missing a value
-+:id [1 2 3 4] (tc/inner-join (tc/dataset {:b ["val1" "val2" "val3"]}) :id [1 2 3 4] @@ -1597,7 +1605,7 @@ (tc/dataset {
-
+:id [1 2 3 ] (tc/right-join (tc/dataset {:b ["val1" "val2" "val3"]}) :id [1 2 3 4] @@ -1642,7 +1650,7 @@ (tc/dataset {
scratch
-+:email ["asdf"] (tc/left-join (tc/dataset {:name ["asdfads"] :entry-id [1 2 3]}) @@ -1698,7 +1706,7 @@
-
+@@ -1730,7 +1738,7 @@:email ["asdf"] (tc/dataset {:name ["asdfads"] :entry-id [1 2 3]})
-
+@@ -1763,7 +1771,7 @@:entry-id [1 2 3] (tc/dataset {:upload-count [2 3 4] :catgory ["art" "science"]})
see tablecloth join stuff Inner join, only keeps rows with the specified column value in common
-+:id) (tc/inner-join ds1 ds2
inner-join [1 3]:
@@ -1787,7 +1795,7 @@Converting between wide and long formats? Signal processing/time series analysis
+def exp-moving-avg (let [data (get co2-over-time "adjusted CO2") ( @@ -1801,7 +1809,7 @@ moving-avg
+(tc/append co2-over-time exp-moving-avg)
data/co2_over_time.csv [741 4]:
@@ -1952,7 +1960,7 @@+def rolling-average ("Rolling average" (tc/dataset [[-> co2-over-time @@ -1961,7 +1969,7 @@ (
:relative-window-position :left}))]])) {
fun/mean+(tc/append co2-over-time rolling-average)
data/co2_over_time.csv [741 4]:
@@ -2112,7 +2120,7 @@+-> co2-over-time ( )
Summarizing data (mean, standard deviation, confidence intervals etc.)
+def avg-co2-by-year (-> co2-over-time (fn [row] @@ -2260,7 +2268,7 @@ (tc/group-by (
+:average-co2 avg-co2-by-year)) (stats/mean (
@@ -2269,7 +2277,7 @@+-> avg-co2-by-year (;; (tc/select-rows (fn [row] (< 1990 (:year row)))) ;; :average-co2 @@ -2406,12 +2414,12 @@
Run length encoding?
+def sparse-dataset (:a [nil 2 3 4 nil nil 7 8] (tc/dataset {:b [10 11 12 nil nil nil 16 nil]}))
+-> sparse-dataset (:up)) (tc/replace-missing
-
+-> sparse-dataset (:updown)) (tc/replace-missing
-
+-> sparse-dataset (:down)) (tc/replace-missing
-
+-> sparse-dataset (:downup)) (tc/replace-missing
-
+-> sparse-dataset (:lerp)) (tc/replace-missing
-
+-> sparse-dataset (:all :value 100)) (tc/replace-missing
-
+diff --git a/chapter_4_data_visualisation/4_2_graphs/index.html b/chapter_4_data_visualisation/4_2_graphs/index.html index 07805f3..06e91fd 100644 --- a/chapter_4_data_visualisation/4_2_graphs/index.html +++ b/chapter_4_data_visualisation/4_2_graphs/index.html @@ -2,12 +2,12 @@ - + --> sparse-dataset (:a :value 100)) (tc/replace-missing
-@@ -2991,8 +2999,8 @@source: book/chapter_3_data_manipulation/3_data_manipulation.clj
+source: book/chapter_3_data_manipulation/3_data_manipulation.clj
Clojure Data Cookbook - 9 Graphs +Clojure Data Cookbook - 10 Graphs - + + -+ns chapter-4-data-visualisation.4-2-graphs (:require [tablecloth.api :as tc] (:as hc] @@ -265,16 +264,16 @@ [aerial.hanami.common
9 :as tc] [tablecloth.api :as kind-clerk])) [scicloj.kind-clerk.api
+(kind-clerk/setup!)
:ok
+def co2-over-time (tc/dataset "data/co2_over_time.csv")) (
+-> co2-over-time (:X "Date" (vis/hanami-plot ht/line-chart {:XTYPE "temporal" @@ -283,15 +282,12 @@
9 :YSCALE {:zero false}}))
---vega-+def diamonds datasets/diamonds) (
+-> diamonds (:X :cut (vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal" @@ -299,13 +295,10 @@
9 :WIDTH 750}))
---vega-+-> diamonds (:X :color (vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal" @@ -313,13 +306,10 @@
9 :WIDTH 750}))
---vega-+-> diamonds (:X :clarity (vis/hanami-plot vht/boxplot-chart {:XTYPE "nominal" @@ -327,13 +317,10 @@
9 :WIDTH 750}))
---vega-+diff --git a/chapter_4_data_visualisation/noj_examples/index.html b/chapter_4_data_visualisation/noj_examples/index.html index 976c4d0..692688a 100644 --- a/chapter_4_data_visualisation/noj_examples/index.html +++ b/chapter_4_data_visualisation/noj_examples/index.html @@ -2,12 +2,12 @@ - + -:ok
@@ -343,7 +330,7 @@@@ -584,14 +571,11 @@9 book/chapter_4_data_visualisation/4_2_graphs.clj +
source: book/chapter_4_data_visualisation/4_2_graphs.clj
9
Clojure Data Cookbook - 10 Graphs with Noj +Clojure Data Cookbook - 9 Graphs with Noj - + + -- 10.1 Bar graphs
-++ -9.1 Bar graphs
+ns chapter-4-data-visualisation.noj-examples (:require [tablecloth.api :as tc] (:as hc] @@ -283,45 +284,37 @@ [aerial.hanami.common
:as color] [clojure2d.color :as kind-clerk])) [scicloj.kind-clerk.api
++(kind-clerk/setup!)
:ok
- 10.2 Raw html
-+++ 9.2 Raw html
+--> "<p>Hello, <i>Noj</i>.</p>" - ( vis/raw-html)
--- ------> [:svg {:height 210 - (:width 500} - :line {:x1 0 - [:y1 0 - :x2 200 - :y2 200 - :style "stroke:rgb(255,0,0);stroke-width:2"}]] - - hiccup/html vis/raw-html)
-+ kind/html)- --+Hello, Noj. +
+++++ (kind/html" + <svg height=100 width=100> +<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' /> +</svg> ")
- 10.3 Visualizing datases with Hanami
++ 9.3 Visualizing datases with Hanami
Noj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.
-+def random-walk (let [n 20] (-> {:x (range n) @@ -329,22 +322,19 @@ (
+))} tc/dataset)))
(reductions- 10.3.1 A simple plot
++ 9.3.1 A simple plot
We can plot a Tablecloth datasete using a Hanami template:
-+-> random-walk ( (vis/hanami-plot ht/point-chart:MSIZE 200})) {
--vega - -+Let us look inside the resulting vega-lite space. We can see the dataset is included as CSV:
-+-> random-walk ( (vis/hanami-plot ht/point-chart:MSIZE 200}) @@ -360,14 +350,14 @@ {
:height 300, :data :values - {"x,y\n0,0.2696595674516514\n1,0.5994221672898448\n2,0.9041662987177651\n3,1.1641703504999699\n4,1.606396428799537\n5,1.3972382302814177\n6,1.7686488303622263\n7,1.8812856284088362\n8,2.1521859934642023\n9,1.761413935660772\n10,1.5350565538499519\n11,1.4760599735629056\n12,1.2326873858637482\n13,1.2742130826088063\n14,0.9937616484523007\n15,1.4130287588308725\n16,1.16480354577581\n17,0.6889384877674767\n18,0.821314858587385\n19,0.7473480777397288\n", + "x,y\n0,0.25915143611932323\n1,0.07679044186868467\n2,-0.16838373926426764\n3,-0.3472917379109737\n4,-0.4185674782284593\n5,-0.3275712090765166\n6,0.06499031613330208\n7,-0.12473464521100663\n8,0.24581959605889236\n9,0.3872343668945971\n10,0.20630731645770806\n11,0.4283007097190942\n12,0.8577253018355132\n13,1.029799282228336\n14,1.500296189747702\n15,1.802090709990422\n16,1.675173594897049\n17,1.5406670970402527\n18,1.5912246361060238\n19,1.7546356050436023\n", :format {:type "csv"}}}
- 10.3.2 Additional Hanami templates
++ 9.3.2 Additional Hanami templates
The
-scicloj.noj.v1.vis.hanami.templates
namespace adds Hanami templates to Hanami’s own collection.+-> datasets/mtcars ( (vis/hanami-plot vht/boxplot-chart:X :gear @@ -375,15 +365,12 @@ {
:Y :mpg}))
---vega-- 10.3.3 Layers
-++ -9.3.3 Layers
+-> random-walk ( (vis/hanami-layers:TITLE "points and a line"} @@ -396,15 +383,12 @@ {
:MCOLOR "brown"})]))
--vega - -+- 10.3.4 Concatenation
-++ 9.3.4 Concatenation
+-> random-walk ( (vis/hanami-vconcat @@ -421,12 +405,9 @@ {}
:WIDTH 100})]))
---vega - -++-> random-walk ( (vis/hanami-hconcat @@ -443,15 +424,12 @@ {}
:WIDTH 100})]))
---vega - -+- 10.3.5 Linear regression
-++ -9.3.5 Linear regression
+-> datasets/mtcars (:mpg [:wt] (stats/add-predictions :model-type :smile.regression/ordinary-least-square}) @@ -472,30 +450,24 @@ {
:YTITLE :mpg})])) --vega - -+- 10.3.6 Histogram
-++ -9.3.6 Histogram
+-> datasets/iris (:sepal-width (vis/hanami-histogram :nbins 10})) {
--vega-- 10.3.7 Combining a few things together
++ 9.3.7 Combining a few things together
The following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.
-+let [pallete (->> :accent ( color/palettemapv color/format-hex))] @@ -528,13 +500,10 @@ (
nil {}))))
(vis/hanami-vconcat--vega - -+A similar example with histograms:
-+let [pallete (->> :accent ( color/palettemapv color/format-hex))] @@ -549,13 +518,10 @@ (
nil {}))))
(vis/hanami-vconcat--vega-Scatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).
-+-> datasets/mtcars (:gear]) (tc/group-by [:mpg [:wt] @@ -585,12 +551,9 @@ (stats/add-predictions
kind/vega-lite)
---vega - -++diff --git a/index.html b/index.html index a1c59f2..529baf1 100644 --- a/index.html +++ b/index.html @@ -2,7 +2,7 @@ - + @@ -182,14 +182,14 @@ @@ -203,7 +203,7 @@:bye
@@ -600,7 +563,7 @@@@ -843,11 +806,14 @@book/chapter_4_data_visualisation/noj_examples.clj +
source: book/chapter_4_data_visualisation/noj_examples.clj
Table of contents
Clojure Data Cookbook
- - -+ + +ns index (:nextjournal.clerk/visibility {:code :hide}} {:require @@ -268,8 +268,6 @@ (
1 Preface
Welcome to the Clojure Data Cookbook! This is the website for the work-in-progress that will become the Clojure Data Cookbook. The goal is to provide a reference for anyone who has data to work with and an interest in doing it in Clojure, documenting the current community recommendations and default stack for data science in Clojure.
1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D
-
-Contents @@ -321,17 +319,24 @@
Chapter_4_data_visualisation/noj_examples -
+ +dev +
+1.2 Recommended sections
-randomizing order +-diff --git a/search.json b/search.json index c39f026..3826df4 100644 --- a/search.json +++ b/search.json @@ -11,7 +11,7 @@ "href": "index.html#note-all-work-here-is-in-progress-subject-to-change-very-messy-and-partially-done.-please-bear-with-me-as-i-work-on-through-this-project-d", "title": "Clojure Data Cookbook", "section": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D", - "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. Please bear with me as I work on through this project :D\n\n\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples" + "text": "1.1 Note! all work here is in progress, subject to change, very messy, and partially done. 
Please bear with me as I work on through this project :D\n\n\nContents\n\n\n\nchapter_1_intro\n\n\nChapter_1_intro/1_1_welcome.html\n\n\nChapter_1_intro/1_2_why_clojure.html\n\n\nChapter_1_intro/1_3_set_up.html\n\n\n\n\nchapter_2_input_output\n\n\nChapter_2_input_output/2_1_loading_data\n\n\nChapter_2_input_output/2_2_messy_data\n\n\nChapter_2_input_output/2_3_exporting_data\n\n\n\n\nchapter_3_data_manipulation\n\n\nChapter_3_data_manipulation/3_data_manipulation\n\n\n\n\nchapter_4_data_visualisation\n\n\nChapter_4_data_visualisation/4_2_graphs\n\n\nChapter_4_data_visualisation/noj_examples\n\n\n\n\ndev\n\n\nDev" }, { "objectID": "index.html#recommended-sections", @@ -200,41 +200,41 @@ "href": "chapter_3_data_manipulation/3_data_manipulation/index.html#randomizing-order", "title": "8 Data manipulation", "section": "8.3 Randomizing order", - "text": "8.3 Randomizing order\n\n(-> dataset tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000" + "text": "8.3 Randomizing order\n\n(-> dataset\n tc/shuffle)\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nUSA\n9000000\n\n\nCanada\n10000000\n\n\nGermany\n80000" }, { "objectID": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation", "href": "chapter_3_data_manipulation/3_data_manipulation/index.html#repeatable-randomisation", "title": "8 Data manipulation", "section": "8.4 Repeatable randomisation", - "text": "8.4 Repeatable randomisation\n\n(-> dataset (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) 
https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes (tc/unique-by #(-> % :country str/lower-case) {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. 
converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0E+07\n4.00000000\n\n\nUSA\n9000000\n1.0E+08\n11.11111111\n\n\nGermany\n80000\n8.0E+07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj" - }, - { - "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html", - "href": "chapter_4_data_visualisation/4_2_graphs/index.html", - "title": "9 Graphs", - "section": "", - "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot 
ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\nvega\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\nvega\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj" + "text": "8.4 Repeatable randomisation\n\n(-> dataset\n (tc/shuffle {:seed 100}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nGermany\n80000\n\n\nUSA\n9000000\n\n\n\nFinding unique rows\n\n(def dupes (tc/dataset [{:country \"Canada\"\n :size 10000000}\n {:country \"Canada\"\n :size 10000303}\n {:country \"United states\"\n :size 9000000}\n {:country \"United States\"\n :size 9000000}\n {:country \"Germany\"\n :size 80000}]))\n\n(def “USA” #{“USA” “United States” “United states of America”}) https://scicloj.github.io/tablecloth/index.html#Unique\n\n(-> dupes\n tc/unique-by)\n\n_unnamed [5 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :size))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nCanada\n10000303\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by :country))\n\n_unnamed [4 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> dupes\n (tc/unique-by #(-> % :country str/lower-case)))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUnited states\n9000000\n\n\nGermany\n80000\n\n\n\n\n(-> 
dupes\n (tc/unique-by #(-> % :country str/lower-case)\n {:strategy (fn [vals]\n (case (tdsc/column-name vals)\n :size (apply max vals)\n :country (last vals)))}))\n\n_unnamed [3 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000303\n\n\nUnited States\n9000000\n\n\nGermany\n80000\n\n\n\ncould use this to rename vals to a canonical one (e.g. convert everything that matches set of USA to “USA”) Adding computed columns to data “lengthening” or “widening” data, making it “tidy” e.g. converting a column with numbers to a category (>5 “yes”, <5 “no”), summing multiple columns into a new one\n\n(-> dataset\n (tc/add-column :area [9000000 8000000 1000000]))\n\n_unnamed [3 3]:\n\n\n\n:country\n:size\n:area\n\n\n\n\nCanada\n10000000\n9000000\n\n\nUSA\n9000000\n8000000\n\n\nGermany\n80000\n1000000\n\n\n\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/convert-types :population :double)\n (tc/add-column :density (fn [d]\n (fun// (:population d) (:area d)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n4.0e07\n4.00000000\n\n\nUSA\n9000000\n1.0e08\n11.11111111\n\n\nGermany\n80000\n8.0e07\n1000.00000000\n\n\n\nvs, probably preferable\n\n(-> dataset\n (tc/add-column :population [40000000 100000000 80000000])\n (tc/rename-columns {:size :area})\n (tc/add-column :density (fn [ds]\n (fun// (fun/* 1.0 (:population ds)) (:area ds)))))\n\n_unnamed [3 4]:\n\n\n\n:country\n:area\n:population\n:density\n\n\n\n\nCanada\n10000000\n40000000\n4.00000000\n\n\nUSA\n9000000\n100000000\n11.11111111\n\n\nGermany\n80000\n80000000\n1000.00000000\n\n\n\n\nRemoving columns\n\n\n(-> dataset\n (tc/drop-columns :size))\n\n_unnamed [3 1]:\n\n\n\n:country\n\n\n\n\nCanada\n\n\nUSA\n\n\nGermany\n\n\n\n\nTransforming values\nWorking with nested data structures, really nice libraries in Clojure for doing this (specter, meander)\nAll values in a column\nConditional transformation (e.g. 
“truncate only 11 digit phone numbers to 10 digits”)\nRearranging order of columns\nRenaming columns\nFiltering rows\nSingle filter, multiple filters\n\n\n(-> dataset\n (tc/select-rows (fn [row]\n (< 1000000 (:size row)))))\n\n_unnamed [2 2]:\n\n\n\n:country\n:size\n\n\n\n\nCanada\n10000000\n\n\nUSA\n9000000\n\n\n\n\nAggregating rows (counts, groups)\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))}))\n\n_unnamed [1 1]:\n\n\n\n:average-co2\n\n\n\n\n355.31093117\n\n\n\nAdd a column for year\n\n(-> co2-over-time\n (tc/map-columns \"Year\" \"Date\" (memfn getYear)))\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nYear\n\n\n\n\n1958-03-01\n315.70\n314.44\n1958\n\n\n1958-04-01\n317.46\n315.16\n1958\n\n\n1958-05-01\n317.51\n314.71\n1958\n\n\n1958-07-01\n315.86\n315.19\n1958\n\n\n1958-08-01\n314.93\n316.19\n1958\n\n\n1958-09-01\n313.21\n316.08\n1958\n\n\n1958-11-01\n313.33\n315.20\n1958\n\n\n1958-12-01\n314.67\n315.43\n1958\n\n\n1959-01-01\n315.58\n315.54\n1959\n\n\n1959-02-01\n316.49\n315.86\n1959\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n2019\n\n\n2019-07-01\n411.85\n411.03\n2019\n\n\n2019-08-01\n410.08\n411.62\n2019\n\n\n2019-09-01\n408.55\n412.06\n2019\n\n\n2019-10-01\n408.43\n412.06\n2019\n\n\n2019-11-01\n410.29\n412.56\n2019\n\n\n2019-12-01\n411.85\n412.78\n2019\n\n\n2020-01-01\n413.37\n413.32\n2020\n\n\n2020-02-01\n414.09\n413.33\n2020\n\n\n2020-03-01\n414.51\n412.94\n2020\n\n\n2020-04-01\n416.18\n413.35\n2020\n\n\n\nGroup by year\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\")))))\n\n_unnamed [63 3]:\n\n\n\n:name\n:group-id\n:data\n\n\n\n\n1958\n0\nGroup: 1958 [8 3]:\n\n\n1959\n1\nGroup: 1959 [12 3]:\n\n\n1960\n2\nGroup: 1960 [12 3]:\n\n\n1961\n3\nGroup: 1961 [12 3]:\n\n\n1962\n4\nGroup: 1962 [12 3]:\n\n\n1963\n5\nGroup: 1963 [12 3]:\n\n\n1964\n6\nGroup: 1964 [9 
3]:\n\n\n1965\n7\nGroup: 1965 [12 3]:\n\n\n1966\n8\nGroup: 1966 [12 3]:\n\n\n1967\n9\nGroup: 1967 [12 3]:\n\n\n…\n…\n…\n\n\n2010\n52\nGroup: 2010 [12 3]:\n\n\n2011\n53\nGroup: 2011 [12 3]:\n\n\n2012\n54\nGroup: 2012 [12 3]:\n\n\n2013\n55\nGroup: 2013 [12 3]:\n\n\n2014\n56\nGroup: 2014 [12 3]:\n\n\n2015\n57\nGroup: 2015 [12 3]:\n\n\n2016\n58\nGroup: 2016 [12 3]:\n\n\n2017\n59\nGroup: 2017 [12 3]:\n\n\n2018\n60\nGroup: 2018 [12 3]:\n\n\n2019\n61\nGroup: 2019 [12 3]:\n\n\n2020\n62\nGroup: 2020 [4 3]:\n\n\n\nGet average temp per year tablecloth applies the aggregate fn to every groups dataset\n\n(defn round2\n \"Round a double to the given precision (number of significant digits)\"\n [precision d]\n (let [factor (Math/pow 10 precision)]\n (/ (Math/round (* d factor)) factor)))\n\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (round2 2\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\")))))}))\n\n_unnamed [63 2]:\n\n\n\n:$group-name\n:average-co2\n\n\n\n\n1958\n315.33\n\n\n1959\n315.98\n\n\n1960\n316.91\n\n\n1961\n317.65\n\n\n1962\n318.45\n\n\n1963\n318.99\n\n\n1964\n319.20\n\n\n1965\n320.04\n\n\n1966\n321.37\n\n\n1967\n322.18\n\n\n…\n…\n\n\n2010\n389.90\n\n\n2011\n391.65\n\n\n2012\n393.87\n\n\n2013\n396.57\n\n\n2014\n398.61\n\n\n2015\n400.89\n\n\n2016\n404.28\n\n\n2017\n406.58\n\n\n2018\n408.59\n\n\n2019\n411.50\n\n\n2020\n414.54\n\n\n\nCan rename the column to be more descriptive\n\n(-> co2-over-time\n (tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (/ (reduce + (get ds \"CO2\"))\n (count (get ds \"CO2\"))))})\n (tc/rename-columns {:$group-name :year}))\n\n_unnamed [63 
2]:\n\n\n\n:year\n:average-co2\n\n\n\n\n1958\n315.33375000\n\n\n1959\n315.98166667\n\n\n1960\n316.90916667\n\n\n1961\n317.64500000\n\n\n1962\n318.45416667\n\n\n1963\n318.99250000\n\n\n1964\n319.20111111\n\n\n1965\n320.03583333\n\n\n1966\n321.36916667\n\n\n1967\n322.18083333\n\n\n…\n…\n\n\n2010\n389.90083333\n\n\n2011\n391.64833333\n\n\n2012\n393.87000000\n\n\n2013\n396.56666667\n\n\n2014\n398.61416667\n\n\n2015\n400.88500000\n\n\n2016\n404.27750000\n\n\n2017\n406.58416667\n\n\n2018\n408.58750000\n\n\n2019\n411.49500000\n\n\n2020\n414.53750000\n\n\n\nConcatenating datasets\n\n(def ds1 (tc/dataset [{:id \"id1\" :b \"val1\"}\n {:id \"id2\" :b \"val2\"}\n {:id \"id3\" :b \"val3\"}]))\n\n\n(def ds2 (tc/dataset [{:id \"id1\" :b \"val4\"}\n {:id \"id5\" :b \"val5\"}\n {:id \"id6\" :b \"val6\"}]))\n\nNaively concats rows\n\n(tc/concat ds1 ds2 (tc/dataset [{:id \"id3\" :b \"other value\"}]))\n\n_unnamed [7 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\nid3\nother value\n\n\n\n\n(tc/concat ds1 (tc/dataset [{:b \"val4\" :c \"text\"}\n {:b \"val5\" :c \"hi\"}\n {:b \"val6\" :c \"test\"}]))\n\n_unnamed [6 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\nid1\nval1\n\n\n\nid2\nval2\n\n\n\nid3\nval3\n\n\n\n\nval4\ntext\n\n\n\nval5\nhi\n\n\n\nval6\ntest\n\n\n\nDe-duping\n\n(tc/union ds1 ds2)\n\nunion [6 2]:\n\n\n\n:id\n:b\n\n\n\n\nid1\nval1\n\n\nid2\nval2\n\n\nid3\nval3\n\n\nid1\nval4\n\n\nid5\nval5\n\n\nid6\nval6\n\n\n\n\nMerging datasets\nWhen column headers are the same or different, on multiple columns TODO explain set logic and SQL joins\n\n\n(def ds3 (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\n\n(def ds4 (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]}))\n\nKeep all columns\n\n(tc/full-join ds3 ds4 :id)\n\nfull-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n4\nval4\n4\nval4\n\n\n\n“Merge” datasets on 
a given column where rows have a value\n\n(tc/inner-join ds3 ds4 :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\nval4\nval4\n\n\n\nDrop rows missing a value\n\n(tc/inner-join (tc/dataset {:id [1 2 3 4]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\ninner-join [4 3]:\n\n\n\n:id\n:b\n:c\n\n\n\n\n1\nval1\nval1\n\n\n2\nval2\nval2\n\n\n3\nval3\nval3\n\n\n4\n\nval4\n\n\n\n\n(tc/right-join (tc/dataset {:id [1 2 3 ]\n :b [\"val1\" \"val2\" \"val3\"]})\n (tc/dataset {:id [1 2 3 4]\n :c [\"val1\" \"val2\" \"val3\" \"val4\"]})\n :id)\n\nright-outer-join [4 4]:\n\n\n\n:id\n:b\n:right.id\n:c\n\n\n\n\n1\nval1\n1\nval1\n\n\n2\nval2\n2\nval2\n\n\n3\nval3\n3\nval3\n\n\n\n\n4\nval4\n\n\n\nscratch\n\n(tc/left-join (tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n (tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n :entry-id)\n\nleft-outer-join [3 6]:\n\n\n\n\n\n\n\n\n\n\n\n:entry-id\n:email\n:name\n:right.entry-id\n:upload-count\n:catgory\n\n\n\n\n1\nasdf\nasdfads\n1\n2\nart\n\n\n2\n\n\n2\n3\nscience\n\n\n3\n\n\n3\n4\n\n\n\n\n\n(tc/dataset {:email [\"asdf\"]\n :name [\"asdfads\"]\n :entry-id [1 2 3]})\n\n_unnamed [3 3]:\n\n\n\n:email\n:name\n:entry-id\n\n\n\n\nasdf\nasdfads\n1\n\n\n\n\n2\n\n\n\n\n3\n\n\n\n\n(tc/dataset {:entry-id [1 2 3]\n :upload-count [2 3 4]\n :catgory [\"art\" \"science\"]})\n\n_unnamed [3 3]:\n\n\n\n:entry-id\n:upload-count\n:catgory\n\n\n\n\n1\n2\nart\n\n\n2\n3\nscience\n\n\n3\n4\n\n\n\n\nsee tablecloth join stuff Inner join, only keeps rows with the specified column value in common\n\n(tc/inner-join ds1 ds2 :id)\n\ninner-join [1 3]:\n\n\n\n:id\n:b\n:right.b\n\n\n\n\nid1\nval1\nval4\n\n\n\n\nConverting between wide and long formats? 
Signal processing/time series analysis\nCompute rolling average to be able to plot a trend line\n\n\n(def exp-moving-avg\n (let [data (get co2-over-time \"adjusted CO2\")\n moving-avg\n (->> data\n (reduce (fn [acc next]\n (conj acc (+ (* 0.9 (last acc)) (* 0.1 next))))\n [(first data)])\n rest)]\n (tc/dataset [[\"Exponential moving average\" moving-avg]])))\n\n\nwiden dataset to include new row that’s already in order\n\n\n(tc/append co2-over-time exp-moving-avg)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nExponential moving average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.51200000\n\n\n1958-05-01\n317.51\n314.71\n314.53180000\n\n\n1958-07-01\n315.86\n315.19\n314.59762000\n\n\n1958-08-01\n314.93\n316.19\n314.75685800\n\n\n1958-09-01\n313.21\n316.08\n314.88917220\n\n\n1958-11-01\n313.33\n315.20\n314.92025498\n\n\n1958-12-01\n314.67\n315.43\n314.97122948\n\n\n1959-01-01\n315.58\n315.54\n315.02810653\n\n\n1959-02-01\n316.49\n315.86\n315.11129588\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n409.42307506\n\n\n2019-07-01\n411.85\n411.03\n409.58376755\n\n\n2019-08-01\n410.08\n411.62\n409.78739079\n\n\n2019-09-01\n408.55\n412.06\n410.01465172\n\n\n2019-10-01\n408.43\n412.06\n410.21918654\n\n\n2019-11-01\n410.29\n412.56\n410.45326789\n\n\n2019-12-01\n411.85\n412.78\n410.68594110\n\n\n2020-01-01\n413.37\n413.32\n410.94934699\n\n\n2020-02-01\n414.09\n413.33\n411.18741229\n\n\n2020-03-01\n414.51\n412.94\n411.36267106\n\n\n2020-04-01\n416.18\n413.35\n411.56140396\n\n\n\n\nRolling average over a 12 point range\n\n\n(def rolling-average\n (tc/dataset [[\"Rolling average\"\n (-> co2-over-time\n (get \"adjusted CO2\")\n (rolling/fixed-rolling-window 12\n fun/mean\n {:relative-window-position :left}))]]))\n\n\n(tc/append co2-over-time rolling-average)\n\ndata/co2_over_time.csv [741 4]:\n\n\n\nDate\nCO2\nadjusted CO2\nRolling 
average\n\n\n\n\n1958-03-01\n315.70\n314.44\n314.44000000\n\n\n1958-04-01\n317.46\n315.16\n314.50000000\n\n\n1958-05-01\n317.51\n314.71\n314.52250000\n\n\n1958-07-01\n315.86\n315.19\n314.58500000\n\n\n1958-08-01\n314.93\n316.19\n314.73083333\n\n\n1958-09-01\n313.21\n316.08\n314.86750000\n\n\n1958-11-01\n313.33\n315.20\n314.93083333\n\n\n1958-12-01\n314.67\n315.43\n315.01333333\n\n\n1959-01-01\n315.58\n315.54\n315.10500000\n\n\n1959-02-01\n316.49\n315.86\n315.22333333\n\n\n…\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n410.14000000\n\n\n2019-07-01\n411.85\n411.03\n410.38583333\n\n\n2019-08-01\n410.08\n411.62\n410.63500000\n\n\n2019-09-01\n408.55\n412.06\n410.88333333\n\n\n2019-10-01\n408.43\n412.06\n411.08750000\n\n\n2019-11-01\n410.29\n412.56\n411.26916667\n\n\n2019-12-01\n411.85\n412.78\n411.48833333\n\n\n2020-01-01\n413.37\n413.32\n411.69250000\n\n\n2020-02-01\n414.09\n413.33\n411.89500000\n\n\n2020-03-01\n414.51\n412.94\n412.10166667\n\n\n2020-04-01\n416.18\n413.35\n412.32083333\n\n\n\n\nTrain a model to predict the next 10 years\n\n\n(-> co2-over-time\n )\n\ndata/co2_over_time.csv [741 3]:\n\n\n\nDate\nCO2\nadjusted CO2\n\n\n\n\n1958-03-01\n315.70\n314.44\n\n\n1958-04-01\n317.46\n315.16\n\n\n1958-05-01\n317.51\n314.71\n\n\n1958-07-01\n315.86\n315.19\n\n\n1958-08-01\n314.93\n316.19\n\n\n1958-09-01\n313.21\n316.08\n\n\n1958-11-01\n313.33\n315.20\n\n\n1958-12-01\n314.67\n315.43\n\n\n1959-01-01\n315.58\n315.54\n\n\n1959-02-01\n316.49\n315.86\n\n\n…\n…\n…\n\n\n2019-06-01\n413.96\n411.38\n\n\n2019-07-01\n411.85\n411.03\n\n\n2019-08-01\n410.08\n411.62\n\n\n2019-09-01\n408.55\n412.06\n\n\n2019-10-01\n408.43\n412.06\n\n\n2019-11-01\n410.29\n412.56\n\n\n2019-12-01\n411.85\n412.78\n\n\n2020-01-01\n413.37\n413.32\n\n\n2020-02-01\n414.09\n413.33\n\n\n2020-03-01\n414.51\n412.94\n\n\n2020-04-01\n416.18\n413.35\n\n\n\n\nSummarizing data (mean, standard deviation, confidence intervals etc.)\nStandard deviation using fastmath\n\n\n(def avg-co2-by-year\n (-> co2-over-time\n 
(tc/group-by (fn [row]\n (.getYear (get row \"Date\"))))\n (tc/aggregate {:average-co2 (fn [ds]\n (stats/mean (get ds \"adjusted CO2\"))\n ;; (/ (reduce + (get ds \"CO2\"))\n ;; (count (get ds \"CO2\")))\n )\n :standard-deviation (fn [ds]\n (stats/stddev (get ds \"adjusted CO2\")))})\n ;; (tc/rename-columns {:$group-name :year})\n ))\n\n\nOverall average\n\n\n(stats/mean (:average-co2 avg-co2-by-year))\n\n\n355.56414902998233\n\n\nLong term average 1991-2020\n\n\n(-> avg-co2-by-year\n ;; (tc/select-rows (fn [row] (< 1990 (:year row))))\n ;; :average-co2\n ;; mean\n )\n\n_unnamed [63 3]:\n\n\n\n:$group-name\n:average-co2\n:standard-deviation\n\n\n\n\n1958\n315.30000000\n0.60318204\n\n\n1959\n315.97750000\n0.47259679\n\n\n1960\n316.90750000\n0.42004599\n\n\n1961\n317.63833333\n0.45170049\n\n\n1962\n318.44833333\n0.37201743\n\n\n1963\n318.98750000\n0.28813270\n\n\n1964\n319.67888889\n0.20127372\n\n\n1965\n320.03083333\n0.50883929\n\n\n1966\n321.36250000\n0.37363388\n\n\n1967\n322.17500000\n0.32326460\n\n\n…\n…\n…\n\n\n2010\n389.89333333\n0.67686891\n\n\n2011\n391.64500000\n0.71908401\n\n\n2012\n393.86500000\n0.87383689\n\n\n2013\n396.55833333\n0.72002315\n\n\n2014\n398.60500000\n0.68076828\n\n\n2015\n400.87833333\n1.02130784\n\n\n2016\n404.27416667\n0.95601881\n\n\n2017\n406.57750000\n0.64441834\n\n\n2018\n408.58166667\n0.99862481\n\n\n2019\n411.48833333\n0.74410206\n\n\n2020\n413.23500000\n0.19706175\n\n\n\n\nWorking with sequential data\nSmoothing out data\nCalculating a moving average\nAveraging a sequence in blocks\nRun length encoding?\nFilling nil s with last non-nil value?\n\n\n(def sparse-dataset\n (tc/dataset {:a [nil 2 3 4 nil nil 7 8]\n :b [10 11 12 nil nil nil 16 nil]}))\n\n\n(-> sparse-dataset\n (tc/replace-missing :up))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :updown))\n\n_unnamed [8 
2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n16\n\n\n7\n16\n\n\n7\n16\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :down))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :downup))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n12\n\n\n4\n12\n\n\n4\n12\n\n\n7\n16\n\n\n8\n16\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :lerp))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n2.0\n10.0\n\n\n2.0\n11.0\n\n\n3.0\n12.0\n\n\n4.0\n13.0\n\n\n5.0\n14.0\n\n\n6.0\n15.0\n\n\n7.0\n16.0\n\n\n8.0\n16.0\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :all :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n100\n\n\n100\n100\n\n\n100\n100\n\n\n7\n16\n\n\n8\n100\n\n\n\n\n(-> sparse-dataset\n (tc/replace-missing :a :value 100))\n\n_unnamed [8 2]:\n\n\n\n:a\n:b\n\n\n\n\n100\n10\n\n\n2\n11\n\n\n3\n12\n\n\n4\n\n\n\n100\n\n\n\n100\n\n\n\n7\n16\n\n\n8\n\n\n\n\n\n\n\n\nsource: book/chapter_3_data_manipulation/3_data_manipulation.clj" }, { "objectID": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs", "href": "chapter_4_data_visualisation/noj_examples/index.html#bar-graphs", - "title": "10 Graphs with Noj", - "section": "10.1 Bar graphs", - "text": "10.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok" + "title": "9 Graphs with Noj", + 
"section": "9.1 Bar graphs", + "text": "9.1 Bar graphs\n\n(ns chapter-4-data-visualisation.noj-examples\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [scicloj.kindly.v4.kind :as kind]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok" }, { "objectID": "chapter_4_data_visualisation/noj_examples/index.html#raw-html", "href": "chapter_4_data_visualisation/noj_examples/index.html#raw-html", - "title": "10 Graphs with Noj", - "section": "10.2 Raw html", - "text": "10.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n vis/raw-html)\n\n\n\n\n\n\n\n(-> [:svg {:height 210\n :width 500}\n [:line {:x1 0\n :y1 0\n :x2 200\n :y2 200\n :style \"stroke:rgb(255,0,0);stroke-width:2\"}]]\n hiccup/html\n vis/raw-html)" + "title": "9 Graphs with Noj", + "section": "9.2 Raw html", + "text": "9.2 Raw html\n\n(-> \"<p>Hello, <i>Noj</i>.</p>\"\n kind/html)\n\n\nHello, Noj.\n\n\n(kind/html\n \"\n<svg height=100 width=100>\n<circle cx=50 cy=50 r=40 stroke='purple' stroke-width=3 fill='floralwhite' />\n</svg> \")" }, { "objectID": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami", "href": "chapter_4_data_visualisation/noj_examples/index.html#visualizing-datases-with-hanami", - "title": "10 Graphs with Noj", - "section": "10.3 Visualizing datases with Hanami", - "text": "10.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n10.3.1 A simple plot\nWe can plot a Tablecloth 
datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\nvega\n\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.2696595674516514\\n1,0.5994221672898448\\n2,0.9041662987177651\\n3,1.1641703504999699\\n4,1.606396428799537\\n5,1.3972382302814177\\n6,1.7686488303622263\\n7,1.8812856284088362\\n8,2.1521859934642023\\n9,1.761413935660772\\n10,1.5350565538499519\\n11,1.4760599735629056\\n12,1.2326873858637482\\n13,1.2742130826088063\\n14,0.9937616484523007\\n15,1.4130287588308725\\n16,1.16480354577581\\n17,0.6889384877674767\\n18,0.821314858587385\\n19,0.7473480777397288\\n\",\n :format {:type \"csv\"}}}\n\n\n\n10.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\nvega\n\n\n\n\n\n10.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\nvega\n\n\n\n\n\n10.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n 
ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\nvega\n\n\n\n\n\n10.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\nvega\n\n\n\n\n\n10.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\nvega\n\n\n\n\n\n10.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\nvega\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite 
for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\nvega\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj" + "title": "9 Graphs with Noj", + "section": "9.3 Visualizing datases with Hanami", + "text": "9.3 Visualizing datases with Hanami\nNoj offers a few convenience functions to make Hanami plotting work smoothly with Tablecloth and Kindly.\n\n(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.3.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200}))\n\n\n\n\nLet us look inside the resulting vega-lite space. 
We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis/hanami-plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,0.25915143611932323\\n1,0.07679044186868467\\n2,-0.16838373926426764\\n3,-0.3472917379109737\\n4,-0.4185674782284593\\n5,-0.3275712090765166\\n6,0.06499031613330208\\n7,-0.12473464521100663\\n8,0.24581959605889236\\n9,0.3872343668945971\\n10,0.20630731645770806\\n11,0.4283007097190942\\n12,0.8577253018355132\\n13,1.029799282228336\\n14,1.500296189747702\\n15,1.802090709990422\\n16,1.675173594897049\\n17,1.5406670970402527\\n18,1.5912246361060238\\n19,1.7546356050436023\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.3.2 Additional Hanami templates\nThe scicloj.noj.v1.vis.hanami.templates namespace add Hanami templates to Hanami’s own collection.\n\n(-> datasets/mtcars\n (vis/hanami-plot vht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n\n\n\n9.3.3 Layers\n\n(-> random-walk\n (vis/hanami-layers\n {:TITLE \"points and a line\"}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\n\n\n\n\n9.3.4 Concatenation\n\n(-> random-walk\n (vis/hanami-vconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n(-> random-walk\n (vis/hanami-hconcat\n {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis/hanami-plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\n\n\n\n\n9.3.5 Linear regression\n\n(-> datasets/mtcars\n (stats/add-predictions :mpg [:wt]\n {:model-type 
:smile.regression/ordinary-least-square})\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR \"purple\"\n :YTITLE :mpg})]))\n\n\n\n\n\n\n9.3.6 Histogram\n\n(-> datasets/iris\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))\n\n\n\n\n\n\n9.3.7 Combining a few things together\nThe following is inspired by the example at Plotnine’s main page. Note how we add regression lines here. We take care of layout and colouring on our side, not using Vega-Lite for that.\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/mtcars\n (tc/group-by :gear {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/select-columns [:gear :wt :mpg :mpg-prediction])\n (vis/hanami-layers {:TITLE (str \"grear=\" group-name)}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :MCOLOR (pallete i)\n :HEIGHT 200\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :MCOLOR (pallete i)\n :YTITLE :mpg})]\n ))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nA similar example with histograms:\n\n(let [pallete (->> :accent\n color/palette\n (mapv color/format-hex))]\n (-> datasets/iris\n (tc/group-by :species {:result-type :as-map})\n (->> (sort-by key)\n (map-indexed\n (fn [i [group-name ds]]\n (-> ds\n (vis/hanami-histogram :sepal-width\n {:nbins 10}))))\n (vis/hanami-vconcat nil {}))))\n\n\n\n\nScatterplots and regression lines again, this time using Vega-Lite for layout and coloring (using its “facet” option).\n\n(-> datasets/mtcars\n (tc/group-by [:gear])\n (stats/add-predictions :mpg [:wt]\n {:model-type :smile.regression/ordinary-least-square})\n (tc/ungroup)\n (tc/select-columns [:gear :wt :mpg 
:mpg-prediction])\n (vis/hanami-layers {}\n [(vis/hanami-plot nil\n ht/point-chart\n {:X :wt\n :Y :mpg\n :MSIZE 200\n :COLOR \"gear\"\n :HEIGHT 100\n :WIDTH 200})\n (vis/hanami-plot nil\n ht/line-chart\n {:X :wt\n :Y :mpg-prediction\n :MSIZE 5\n :COLOR \"gear\"\n :YTITLE :mpg})])\n ((fn [spec]\n {:facet {:row {:field \"gear\"}}\n :spec (dissoc spec :data)\n :data (:data spec)}))\n kind/vega-lite)\n\n\n\n\n\n:bye\n\n\n:bye\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/noj_examples.clj" + }, + { + "objectID": "chapter_4_data_visualisation/4_2_graphs/index.html", + "href": "chapter_4_data_visualisation/4_2_graphs/index.html", + "title": "10 Graphs", + "section": "", + "text": "(ns chapter-4-data-visualisation.4-2-graphs\n (:require [tablecloth.api :as tc]\n [aerial.hanami.common :as hc]\n [aerial.hanami.templates :as ht]\n [scicloj.noj.v1.vis.hanami.templates :as vht]\n [scicloj.noj.v1.vis :as vis]\n [scicloj.noj.v1.stats :as stats]\n [scicloj.noj.v1.datasets :as datasets]\n [tech.v3.datatype :as dtype]\n [tech.v3.datatype.functional :as fun]\n [hiccup.core :as hiccup]\n [clojure2d.color :as color]\n [tablecloth.api :as tc]\n [scicloj.kind-clerk.api :as kind-clerk]))\n\n\n(kind-clerk/setup!)\n\n\n:ok\n\n\n(def co2-over-time (tc/dataset \"data/co2_over_time.csv\"))\n\n\n(-> co2-over-time\n (vis/hanami-plot ht/line-chart {:X \"Date\"\n :XTYPE \"temporal\"\n :WIDTH 750\n :Y \"adjusted CO2\"\n :YSCALE {:zero false}}))\n\n\n\n\n\n(def diamonds datasets/diamonds)\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :cut\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :color\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n(-> diamonds\n (vis/hanami-plot vht/boxplot-chart {:X :clarity\n :XTYPE \"nominal\"\n :Y :price\n :WIDTH 750}))\n\n\n\n\n\n\n:ok\n\n\n:ok\n\n\n\n\n\nsource: book/chapter_4_data_visualisation/4_2_graphs.clj" } ] \ No newline at end of filesource: book/index.clj
+source: book/index.clj