Subset by Index in Clojure

In the previous post https://statcompute.wordpress.com/2018/03/23/subset-by-values-in-clojure, it’s been demonstrated how to subset by value in Clojure. In the example below, I would show how to subset by index by using the keep-indexed function. The key is to use the keep-indexed function locating the position of each lookup value in the country list and then subset the order list by positions, e.g. index.


(require '[clojure.pprint :as p]
         '[clojure.java.jdbc :as j])
 
(def db
  {:classname   "org.sqlite.JDBC"
   :subprotocol "sqlite"
   :subname     "/home/liuwensui/Downloads/chinook.db"})
 
(def orders (j/query db "select billingcountry as country, count(*) as orders from invoices group by billingcountry;"))
 
(def clist '("USA" "India" "Canada" "France")

(def country (map #(get % :country) orders))

(p/print-table 
  (map #(nth orders %) 
    (flatten (pmap (fn [c] (keep-indexed (fn [i v] (if (= v c) i)) country)) clist))))

;| :country | :orders |
;|----------+---------|
;|      USA |      91 |
;|    India |      13 |
;|   Canada |      56 |
;|   France |      35 |

Updating Column Values in Clojure Map


(require '[huri.core :as h]
         '[clojure.core.matrix.dataset :as d]
         '[incanter.core :as i])

(def ds [{:id 1.0 :name "name1"}
         {:id 2.0 :name "name2"}
         {:id 3.0 :name "name3"}])

;; UPDATE THE :NAME COLUMN IN THE DATASET
;; - IF THE VALUE IS NOT "NAME2", THEN CHANGE TO "NOT 2"
;;
;; EXPECTED OUTPUT:
;; | :id | :name |
;; |-----+-------|
;; | 1.0 | not 2 |
;; | 2.0 | name2 |
;; | 3.0 | not 2 |

;; WITH CLOJURE.CORE/UPDATE
(def d1 (map (fn [x] (update x :name #(if (= "name2" %) % "not 2"))) ds))

;; WITH CLOJURE.CORE/UPDATE-IN
(def d2 (map (fn [x] (update-in x [:name] #(if (= "name2" %) % "not 2"))) ds))

;; WITH HURI/UPDATE-COLS
(def d3 (h/update-cols {:name #(if (= "name2" %) % "not 2")} ds))

;; WITH MATRIX.DATASET/EMAP-COLUMN
(def d4 (-> ds
            (d/dataset)
            (d/emap-column :name #(if (= "name2" %) % "not 2"))
            ((comp #(map into %) d/row-maps))))
   
;; WITH INCANTER/TRANSFORM-COL
(def d5 (-> ds
            (i/to-dataset)
            (i/transform-col :name #(if (= "name2" %) % "not 2"))
            ((comp #(map into %) second vals))))

Adding New Columns to Clojure Map


(require '[huri.core :as h]
         '[clojure.core.matrix.dataset :as d]
         '[incanter.core :as i])

(def ds [{:id 1.0 :name "name1"}
         {:id 2.0 :name "name2"}
         {:id 3.0 :name "name3"}])

;; ADD 2 COLUMNS TO THE DATASET
;; - ADD 2 TO ID AND NAME ADD2
;; - CHECK NAME = "name2" AND NAME NAME2
;;
;; EXPECTED OUTPUT:
;;| :id | :name | :add2 | :name2 |
;;|-----+-------+-------+--------|
;;| 1.0 | name1 |   3.0 |      N |
;;| 2.0 | name2 |   4.0 |      Y |
;;| 3.0 | name3 |   5.0 |      N |

;; WITH PLAIN CLOJURE
;; #1 - MERGE
(def d1 (map #(merge % {:add2 (+ (:id %) 2) 
                        :name2 (if (= "name2" (:name %)) "Y" "N")}) ds))

;; #2 - MERGE-WITH
(def d2 (map #(merge-with into % {:add2 (+ (:id %) 2)
                                  :name2 (if (= "name2" (:name %)) "Y" "N")}) ds))

;; #3 - ASSOC
(def d3 (map #(assoc % :add2 (+ (:id %) 2) 
                       :name2 (if (= "name2" (:name %)) "Y" "N")) ds))

;; #4 - CONJ
(def d4 (map #(conj % {:add2 (+ (:id %) 2)
                       :name2 (if (= "name2" (:name %)) "Y" "N")}) ds))

;; #5 - CONCAT 
(def d5 (map #(into {} (concat % {:add2 (+ (:id %) 2)
                                  :name2 (if (= "name2" (:name %)) "Y" "N")})) ds))

;; WITH HURI 
(def d6 (h/derive-cols {:name2 [#(if (= "name2" %) "Y" "N") :name] 
                        :add2 [#(+ 2  %) :id]} ds))

;; WITH CORE.MATRIX API
(def d7 (-> ds
            (d/dataset)
            (d/add-column :add2 (map #(+ 2 %) (map :id ds)))
            (d/add-column :name2 (map #(if (= "name2" %) "Y" "N") (map :name ds)))
            (d/row-maps)))

;; WITH INCANTER API
(def d8 (->> ds
             (i/to-dataset)
             (i/add-derived-column :add2 [:id] #(+ 2 %))
             (i/add-derived-column :name2 [:name] #(if (= "name2" %) "Y" "N"))
             ((comp second vals))))

;; CHECK THE DATA EQUALITY
(= d1 d2 d3 d4 d5 d6 d7 d8)
;; true

Transpose in Clojure


(require '[huri.core :as h]
         '[clojure.core.matrix.dataset :as d]
         '[incanter.core :as i])

;; FROM MAP OF ROWS TO MAP OF COLUMNS

(def byRow [{:x 1 :y "a"}
            {:x 2 :y "b"}
            {:x 3 :y "c"}])

;; APPROACH #1 - PLAIN CLOJURE
(zipmap (keys (first byRow)) (apply map list (map vals byRow)))

; {:x (1 2 3), :y ("a" "b" "c")}

;; APPROACH #2 - HURI LIBRARY
(h/col-oriented byRow)

; {:x (1 2 3), :y ("a" "b" "c")}

;; APPROACH #3 - CORE.MATRIX LIBRARY
(d/to-map (d/dataset (keys (first byRow)) byRow))

; {:x [1 2 3], :y ["a" "b" "c"]}

;; APPROACH #4 - INCANTER LIBRARY
(i/to-map (i/to-dataset byRow))

; {:x (1 2 3), :y ("a" "b" "c")}

;; FROM MAP OF COLUMNS TO MAP OF ROWS

(def byCol {:x '(1 2 3)
            :y '("a" "b" "c")})

;; APPROACH #1 - PLAIN CLOJURE
(map #(zipmap (keys byCol) %) (apply map list (vals byCol)))

; ({:x 1, :y "a"} {:x 2, :y "b"} {:x 3, :y "c"})

;; APPROACH #2 - HURI LIBRARY
(h/row-oriented byCol)

; ({:x 1, :y "a"} {:x 2, :y "b"} {:x 3, :y "c"})

;; APPROACH #3 - CORE.MATRIX LIBRARY
(d/row-maps (d/dataset (keys byCol) byCol))

; [{:x 1, :y "a"} {:x 2, :y "b"} {:x 3, :y "c"}]

;; APPROACH #4 - INCANTER LIBRARY
(second (vals (i/dataset (keys byCol) (apply map list (vals byCol)))))

; ({:x 1, :y "a"} {:x 2, :y "b"} {:x 3, :y "c"})

Clojure Integration with R


(require '[tnoda.rashinban :as rr]
         '[tnoda.rashinban.core :as rc]
         '[clojure.core.matrix.dataset :as dt]
         '[clojure.core.matrix.impl.dataset :as id])

;; CREATE A TOY DATA
(def ds [{:id 1.0 :name "name1"}
         {:id 2.0 :name "name2"}
         {:id 3.0 :name "name3"}])

;; RUN THE FOLLOWING R CODE IN ADVANCE TO START THE RSERVE SERVER:
;;   R -e 'library(Rserve)' -e 'Rserve(args = "--vanilla")'
;; IF YOU HAVE LITTLER INSTALLED, BELOW ALSO WORKS:
;;   r -e 'library(Rserve); Rserve(args = "--vanilla")'  
(rr/init)

;; PASS THE DATA FROM CLOJURE INTO R
(map (fn [x] (rr/<- (name (key x)) (val x))) 
  (let [ks ((comp keys first) ds)] (zipmap ks (map #(map % ds) ks))))

(rr/<- 'header (map name ((comp keys first) ds)))
         
;; CREATE THE R DATA.FRAME         
(rc/eval "df = data.frame(lapply(header, as.name))")

;; TEST THE R DATA.FRAME
(rc/eval "df$id")
; [1.0 2.0 3.0]

(rc/eval "df$name")
; ["name1" "name2" "name3"]

;; CONVERT THE R DATA.FRAME BACK TO THE CLOJURE MAP
(def mp (into [] (map #(zipmap (map keyword (rr/colnames 'df)) %) 
                   (partition (count (rr/colnames 'df)) (apply interleave (rr/matrix 'df))))))

; [{:id 1.0, :name "name1"} {:id 2.0, :name "name2"} {:id 3.0, :name "name3"}]

;; TEST THE EQUALITY BETWEEN INPUT AND OUTPUT DATA
(= mp ds)
; true

;; ALTERNATIVELY, WE CAN ALSO CONVERT THE R DATA.FRAME TO A CLOJURE DATASET
(def dt (id/dataset-from-columns (map keyword (rr/colnames 'df)) (rr/matrix 'df)))

; #dataset/dataset {:column-names [:id :name], :columns [[1.0 2.0 3.0] ["name1" "name2" "name3"]], :shape [3 2]}

;; NEXT, CONVERT THE DATASET TO THE MAP
(def mp2 (dt/row-maps dt))

; [{:id 1.0, :name "name1"} {:id 2.0, :name "name2"} {:id 3.0, :name "name3"}]

(= ds mp2)
; true

Aggregation by Multiple Keys in Clojure


(require '[ultra-csv.core :refer [read-csv]]
         '[criterium.core :refer [quick-bench]]
         '[clojure.set :refer [index]])

(def ds (read-csv "/home/liuwensui/Downloads/nycflights.csv"))

;; FASTEST
(quick-bench
  (map
    (fn [x] {:year (first (key x))
             :month (last (key x))
             :flights (count (val x))})
      (group-by (juxt :year :month) ds)))      

;Evaluation count : 6 in 6 samples of 1 calls.
;             Execution time mean : 712.329182 ms
;    Execution time std-deviation : 3.832950 ms
;   Execution time lower quantile : 709.135737 ms ( 2.5%)
;   Execution time upper quantile : 718.651856 ms (97.5%)
;                   Overhead used : 11.694357 ns

;; WORKS FINE
(quick-bench
  (map
    (fn [x] {:year (:year (key x))
             :month (:month (key x))
             :flights (count (val x))})
      (group-by #(select-keys % [:year :month]) ds)))
      
;Evaluation count : 6 in 6 samples of 1 calls.
;             Execution time mean : 1.485215 sec
;    Execution time std-deviation : 9.832209 ms
;   Execution time lower quantile : 1.476116 sec ( 2.5%)
;   Execution time upper quantile : 1.500560 sec (97.5%)
;                   Overhead used : 11.694357 ns

;; SLOWEST
(quick-bench
  (map
    (fn [x] {:year (:year (key x))
             :month (:month (key x))
             :flights (count (val x))})
      (index ds [:year :month])))
      
;Evaluation count : 6 in 6 samples of 1 calls.
;             Execution time mean : 2.158245 sec
;    Execution time std-deviation : 11.208489 ms
;   Execution time lower quantile : 2.149538 sec ( 2.5%)
;   Execution time upper quantile : 2.175743 sec (97.5%)
;                   Overhead used : 11.694357 ns

Inner and Outer Joins in Clojure


(require '[clojure.pprint :refer [print-table] :rename {print-table p}]
         '[clojure.set :as s]
         '[clojure.core.reducers :as r])

;; CREATE TOY DATASETS                 
(def ds1 [{:id 1 :name "name1"}
          {:id 2 :name "name2"}
          {:id 3 :name "name3"}])
          
(def ds2 [{:id 2 :address "addr2"}
          {:id 3 :address "addr3"}
          {:id 4 :address "addr4"}])

;; GET THE HEADER
(def ks ((comp distinct flatten) (map #((comp keys first) %) [ds1 ds2])))

;; INNER JOIN WITH SET/JOIN
(p ks
  (s/join ds1 ds2))

;| :id | :name | :address |
;|-----+-------+----------|
;|   3 | name3 |    addr3 |
;|   2 | name2 |    addr2 |

;; OUTER JOIN #1
(p ks (map #(apply merge %) (vals (group-by :id (concat ds1 ds2)))))

;| :id | :name | :address |
;|-----+-------+----------|
;|   1 | name1 |          |
;|   2 | name2 |    addr2 |
;|   3 | name3 |    addr3 |
;|   4 |       |    addr4 |

;; OUTER JOIN #2 -- AN EXAMPLE OF USING REDUCERS
(p ks (into () (r/map #(r/reduce merge %) (vals (s/index (s/union ds2 ds1) [:id])))))

;| :id | :name | :address |
;|-----+-------+----------|
;|   1 | name1 |          |
;|   4 |       |    addr4 |
;|   3 | name3 |    addr3 |
;|   2 | name2 |    addr2 |
 
 ;; OUTER JOIN #3 -- USE LET CREATING LOCAL VARIABLES
(p ks (let [z1 (zipmap (map :id ds1) ds1) 
            z2 (zipmap (map :id ds2) ds2)]
        (vals (merge-with merge z1 z2))))

;| :id | :name | :address |
;|-----+-------+----------|
;|   1 | name1 |          |
;|   2 | name2 |    addr2 |
;|   3 | name3 |    addr3 |
;|   4 |       |    addr4 |

By-Group Statistical Summary in Clojure

In the previous post (https://statcompute.wordpress.com/2018/03/16/for-loop-and-map-in-clojure), I did a performance comparison between MAP and FOR loop in Clojure with a small dataset. It is interesting to see that the performance of PMAP, e.g. parallel MAP, is considerably below the performance of MAP and FOR loop, which is quite counter-intuitive.

Today, I employed a relatively large dataset with 0.3 million records to perform a by-group statistical summary. In the example, five different approaches were experimented, including MAP, PMAP, Reducer MAP, FOR loop, and LOOP/RECUR. As shown below, PMAP is at least 30% more efficient than the rest in this particular case.


(require '[clojure.pprint :as p]
         '[ultra-csv.core :as u]
         '[clj-statistics-fns.core :as s]
         '[clojure.core.reducers :as r])


(def ds (u/read-csv "/home/liuwensui/Downloads/nycflights.csv"))


;; SHOW HEADERS OF THE DATA
(prn (keys (first ds)))

; (:day :hour :tailnum :arr_time :month :dep_time :carrier :arr_delay :year :dep_delay :origin :flight :distance :air_time :dest :minute)


;; PRINT A DATA SAMPLE
(p/print-table
  (map #(select-keys % [:origin :dep_delay]) (take 3 ds)))

; | :origin | :dep_delay |
; |---------+------------|
; |     EWR |          2 |
; |     LGA |          4 |
; |     JFK |          2 |


;; APPROACH #1: MAP()
(time
  (p/print-table
    (map
      (fn [x] {:origin (first x)
               :freq (format "%,8d" (count (second x)))
               :nmiss (format "%,8d" (count (filter nil? (map #(get % :dep_delay) (second x)))))
               :med_delay (format "%,8d" (s/median (remove nil? (map #(get % :dep_delay) (second x)))))
               :75q_delay (format "%,8d" (s/kth-percentile 75 (remove nil? (map #(get % :dep_delay) (second x)))))
               :max_delay (format "%,8d" (reduce max (remove nil? (map #(get % :dep_delay) (second x)))))})
        (group-by :origin ds))))

; | :origin |    :freq |   :nmiss | :med_delay | :75q_delay | :max_delay |
; |---------+----------+----------+------------+------------+------------|
; |     EWR |  120,835 |    3,239 |         -1 |         15 |      1,126 |
; |     LGA |  104,662 |    3,153 |         -3 |          7 |        911 |
; |     JFK |  111,279 |    1,863 |         -1 |         10 |      1,301 |
; "Elapsed time: 684.71396 msecs"


;; APPROACH #2: PMAP()
(time
  (p/print-table
    (pmap
      (fn [x] {:origin (first x)
               :freq (format "%,8d" (count (second x)))
               :nmiss (format "%,8d" (count (filter nil? (map #(get % :dep_delay) (second x)))))
               :med_delay (format "%,8d" (s/median (remove nil? (map #(get % :dep_delay) (second x)))))
               :75q_delay (format "%,8d" (s/kth-percentile 75 (remove nil? (map #(get % :dep_delay) (second x)))))
               :max_delay (format "%,8d" (reduce max (remove nil? (map #(get % :dep_delay) (second x)))))})
        (group-by :origin ds))))

; | :origin |    :freq |   :nmiss | :med_delay | :75q_delay | :max_delay |
; |---------+----------+----------+------------+------------+------------|
; |     EWR |  120,835 |    3,239 |         -1 |         15 |      1,126 |
; |     LGA |  104,662 |    3,153 |         -3 |          7 |        911 |
; |     JFK |  111,279 |    1,863 |         -1 |         10 |      1,301 |
; "Elapsed time: 487.065551 msecs"


;; APPROACH #3: REDUCER MAP()
(time
  (p/print-table
    (into ()
      (r/map
        (fn [x] {:origin (first x)
                 :freq (format "%,8d" (count (second x)))
                 :nmiss (format "%,8d" (count (filter nil? (pmap #(get % :dep_delay) (second x)))))
                 :med_delay (format "%,8d" (s/median (remove nil? (pmap #(get % :dep_delay) (second x)))))
                 :75q_delay (format "%,8d" (s/kth-percentile 75 (remove nil? (pmap #(get % :dep_delay) (second x)))))
                 :max_delay (format "%,8d" (reduce max (remove nil? (pmap #(get % :dep_delay) (second x)))))})
          (group-by :origin ds)))))

; | :origin |    :freq |   :nmiss | :med_delay | :75q_delay | :max_delay |
; |---------+----------+----------+------------+------------+------------|
; |     JFK |  111,279 |    1,863 |         -1 |         10 |      1,301 |
; |     LGA |  104,662 |    3,153 |         -3 |          7 |        911 |
; |     EWR |  120,835 |    3,239 |         -1 |         15 |      1,126 |
; "Elapsed time: 3734.039994 msecs"


;; APPROACH #4: LIST COMPREHENSION
(time
  (p/print-table
    (for [g (group-by :origin ds)]
      ((fn [x] {:origin (first x)
                :freq (format "%,8d" (count (second x)))
                :nmiss (format "%,8d" (count (filter nil? (map #(get % :dep_delay) (second x)))))
                :med_delay (format "%,8d" (s/median (remove nil? (map #(get % :dep_delay) (second x)))))
                :75q_delay (format "%,8d" (s/kth-percentile 75 (remove nil? (map #(get % :dep_delay) (second x)))))
                :max_delay (format "%,8d" (reduce max (remove nil? (map #(get % :dep_delay) (second x)))))})
        g))))

; | :origin |    :freq |   :nmiss | :med_delay | :75q_delay | :max_delay |
; |---------+----------+----------+------------+------------+------------|
; |     EWR |  120,835 |    3,239 |         -1 |         15 |      1,126 |
; |     LGA |  104,662 |    3,153 |         -3 |          7 |        911 |
; |     JFK |  111,279 |    1,863 |         -1 |         10 |      1,301 |
; "Elapsed time: 692.411023 msecs"


;; APPROACH #5: LOOP/RECUR
(time
  (p/print-table
    (loop [i (group-by :origin ds) result '()]
      (if ((complement empty?) i)
        (recur
          (rest i)
          (conj result {:origin (first (first i))
                        :freq (format "%,8d" (count (second (first i))))
                        :nmiss (format "%,8d" (count (filter nil? (map #(get % :dep_delay) (second (first i))))))
                        :med_delay (format "%,8d" (s/median (remove nil? (map #(get % :dep_delay) (second (first i))))))
                        :75q_delay (format "%,8d" (s/kth-percentile 75 (remove nil? (map #(get % :dep_delay) (second (first i))))))
                        :max_delay (format "%,8d" (reduce max (remove nil? (map #(get % :dep_delay) (second (first i))))))}))
        result))))

; | :origin |    :freq |   :nmiss | :med_delay | :75q_delay | :max_delay |
; |---------+----------+----------+------------+------------+------------|
; |     JFK |  111,279 |    1,863 |         -1 |         10 |      1,301 |
; |     LGA |  104,662 |    3,153 |         -3 |          7 |        911 |
; |     EWR |  120,835 |    3,239 |         -1 |         15 |      1,126 |
; "Elapsed time: 692.717104 msecs"

Subset by Values in Clojure


(require '[clojure.pprint :as p]
         '[clojure.java.jdbc :as j])

(def db
  {:classname   "org.sqlite.JDBC"
   :subprotocol "sqlite"
   :subname     "/home/liuwensui/Downloads/chinook.db"})

(def orders (j/query db "select billingcountry as country, count(*) as orders from invoices group by billingcountry;"))

(def clist '("USA" "India" "Canada" "France"))

;; APPROACH #1: LIST COMPREHENSION
(p/print-table 
  (flatten (for [c clist] (for [o orders :when (= c (:country o))] o))))

;| :country | :orders |
;|----------+---------|
;|      USA |      91 |
;|    India |      13 |
;|   Canada |      56 |
;|   France |      35 |

;; APPROACH #2: FILTER FUNCTION
(p/print-table 
  (flatten (map (fn [c] (filter #(= (:country %) c) orders)) clist)))

;| :country | :orders |
;|----------+---------|
;|      USA |      91 |
;|    India |      13 |
;|   Canada |      56 |
;|   France |      35 |

;; APPROACH #3: SET/JOIN FUNCTION
(p/print-table 
  (clojure.set/join orders (into () (for [c clist] {:country c}))))

;| :country | :orders |
;|----------+---------|
;|      USA |      91 |
;|   France |      35 |
;|    India |      13 |
;|   Canada |      56 |

;; APPROACH #4: SET/SELECT FUNCTION
(p/print-table
   (map (fn [c] (into {} (clojure.set/select #(= (:country %) c) (set orders)))) clist))

;| :country | :orders |
;|----------+---------|
;|      USA |      91 |
;|    India |      13 |
;|   Canada |      56 |
;|   France |      35 |

;; APPROACH #5: REDUCER FUNCTIONS
(require '[clojure.core.reducers :as r])

(p/print-table 
  (into () (r/mapcat (fn [c] (r/filter #(= (:country %) c) orders)) clist)))

;| :country | :orders |
;|----------+---------|
;|   France |      35 |
;|   Canada |      56 |
;|    India |      13 |
;|      USA |      91 |

Do We Really Need Dataframe in Clojure?

During my learning of Clojure, I was deeply impressed by the richness of data structures and the built-in support for concurrency, which makes it a perfect programming language for the data science. If you simply compare between R base and Clojure core functions, Clojure wins hands down!

At the beginning, a question that I kept asking myself was why there isn’t something in Clojure similar to R data frame or Python DataFrame. I had experimented Incanter API that is currently dying and was disappointed by its performance. However, after diving deeper, I started wondering whether we really need any additional data structure, such as something similar to the dataframe, just for the purpose of data munging.

Below is an example showing how to aggregate and query data with generic Clojure data structures, e.g. lazyseq and map, and core functions. For the time being, I am somewhat convinced that the life is still good, even without dataframe.


(require '[clojure.pprint :as p]
         '[clojure.java.jdbc :as j])

(def db
  {:classname   "org.sqlite.JDBC"
   :subprotocol "sqlite"
   :subname     "/home/liuwensui/Downloads/chinook.db"})

(pprint (j/query db "select * from invoices limit 1;"))

; ({:invoiceid 1,
;  :customerid 2,
;  :invoicedate "2009-01-01 00:00:00",
;  :billingaddress "Theodor-Heuss-Straße 34",
;  :billingcity "Stuttgart",
;  :billingstate nil,
;  :billingcountry "Germany",
;  :billingpostalcode "70174",
;  :total 1.98})

(def inv (j/query db "select * from invoices;"))

;; AGGREGATE INVOICE TOTAL BY COUNTRIES
(def country_sum 
  (map (fn [[billingcountry total]]
    {:billiingcountry billingcountry :total (reduce + (map :total total))})
    (group-by :billingcountry inv)))

;; TOP 5 COUNTRIES BY INVOICE AMOUNTS 
(p/print-table (take 5 (reverse (sort-by :total country_sum))))

; | :billiingcountry |             :total |
; |------------------+--------------------|
; |              USA |  523.0600000000003 |
; |           Canada |  303.9599999999999 |
; |           France | 195.09999999999994 |
; |           Brazil | 190.09999999999997 |
; |          Germany |             156.48 |

;; SELECT ROWS BY CRITERIA, E.G. US ORDERS BETWEEN $10 AND $12
(def us_inv (filter #(and (= (:billingcountry %) "USA") (< 10 (:total %) 12)) inv))

;; LIST ORDERS MEETING CRITERIA
(pprint us_inv)

;({:invoiceid 298,
;  :customerid 17,
;  :invoicedate "2012-07-31 00:00:00",
;  :billingaddress "1 Microsoft Way",
;  :billingcity "Redmond",
;  :billingstate "WA",
;  :billingcountry "USA",
;  :billingpostalcode "98052-8300",
;  :total 10.91}
; {:invoiceid 311,
;  :customerid 28,
;  :invoicedate "2012-09-28 00:00:00",
;  :billingaddress "302 S 700 E",
;  :billingcity "Salt Lake City",
;  :billingstate "UT",
;  :billingcountry "USA",
;  :billingpostalcode "84102",
;  :total 11.94})

;; SELECT COLUMNS, E.G. STATES AND CITIES
(p/print-table (map #(select-keys % [:invoiceid :billingcountry :billingstate :billingcity]) us_inv))

; | :invoiceid | :billingcountry | :billingstate |   :billingcity |
; |------------+-----------------+---------------+----------------|
; |        298 |             USA |            WA |        Redmond |
; |        311 |             USA |            UT | Salt Lake City |

Parse CSV File with Headers in Clojure


; (defproject prj "0.1.0-SNAPSHOT"
; :dependencies [[org.clojure/clojure "1.8.0"]
; [org.clojars.bmabey/csvlib "0.3.6"]
; [ultra-csv "0.2.1"]
; [incanter "1.5.7"]
; [semantic-csv "0.2.1-alpha1"]
; [org.clojure/data.csv "0.1.4"]])

(require '[csvlib :as csvlib]
         '[ultra-csv.core :as ultra]
         '[semantic-csv.core :as semantic]
         '[clojure-csv.core :as csv]
         '[clojure.java.io :as io]
         '[incanter.core :as ic]
         '[incanter.io :as ii])

;; INCANTER PACKAGE
(time
(ic/$ (range 0 3) '(:dest :origin :hour)
(ii/read-dataset "/home/liuwensui/Downloads/nycflights.csv" :header true :delim \,)))

; "Elapsed time: 14085.382359 msecs"
; | :dest | :origin | :hour |
; |-------+---------+-------|
; | IAH | EWR | 5 |
; | IAH | LGA | 5 |
; | MIA | JFK | 5 |

;; CSVLIB PACKAGE
(time
(ic/$ (range 0 3) '(:dest :origin :hour)
(ic/to-dataset
(csvlib/read-csv "/home/liuwensui/Downloads/nycflights.csv" :headers? true))))

; "Elapsed time: 0.933955 msecs"
; | dest | origin | hour |
; |------+--------+------|
; | IAH | EWR | 5 |
; | IAH | LGA | 5 |
; | MIA | JFK | 5 |

;; ULTRA-CSV PACKAGE
(time
(ic/$ (range 0 3) '(:dest :origin :hour)
(ic/to-dataset
(ultra/read-csv "/home/liuwensui/Downloads/nycflights.csv"))))

; "Elapsed time: 30.599593 msecs"
; | :dest | :origin | :hour |
; |-------+---------+-------|
; | IAH | EWR | 5 |
; | IAH | LGA | 5 |
; | MIA | JFK | 5 |

;; SEMANTIC-CSV PACKAGE
(time
(ic/$ (range 0 3) '(:dest :origin :hour)
(ic/to-dataset
(with-open [in-file (io/reader "/home/liuwensui/Downloads/nycflights.csv")]
(->> (csv/parse-csv in-file) semantic/mappify doall)))))

; "Elapsed time: 8338.366317 msecs"
; | :dest | :origin | :hour |
; |-------+---------+-------|
; | IAH | EWR | 5 |
; | IAH | LGA | 5 |
; | MIA | JFK | 5 |

For Loop and Map in Clojure


;; DEFINE THE DATABASE
(def db
  {:classname   "org.sqlite.JDBC"
   :subprotocol "sqlite"
   :subname     "/home/liuwensui/Downloads/chinook.db"})

;; CALL PACKAGES
(require '[clojure.java.jdbc :as j]
         '[criterium.core :as c]
         '[clojure.core.reducers :as r])

;; DEFINE THE QUERY TO PULL THE TABLE LIST
(def qry (str "select tbl_name as tbl from sqlite_master where type = 'table';"))

;; PULL THE TABLE LIST AND CONVERT IT TO A LIST
(def tbl (apply list (for [i (j/query db qry)] (get i :tbl))))

;; PRINT 5 RECORDS OF THE TABLE LIST
(doseq [i (take 5 tbl)] (prn i))
; "albums"
; "sqlite_sequence"
; "artists"
; "customers"
; "employees"

;; DEFINE A FUNCTION TO COUNT RECORDS IN A TABLE
(defn cnt [x] (j/query db (str "select '" x "' as tbl, count(*) as cnt from " x ";")))

;; TEST THE FUNCTION
(cnt "albums")
; ({:tbl "albums", :cnt 347})

;; FOR LOOP
(c/quick-bench (for [i tbl] (cnt i)))
; Execution time mean : 14.156879 ns

;; MAP FUNCTION
(c/quick-bench (map cnt tbl))
; Execution time mean : 15.623790 ns

;; PMAP FUNCTION - PARALLEL MAP
(c/quick-bench (pmap cnt tbl))
; Execution time mean : 456.780027 µs

;; REDUCERS MAP FUNCTION
(defn rmap [f l] (into () (r/map f l)))
(c/quick-bench (rmap cnt tbl))
; Execution time mean : 8.376753 ms

Clojure and SQLite

First of all, we need to define the database specification.


(def db
  {:classname   "org.sqlite.JDBC"
   :subprotocol "sqlite"
   :subname     "/home/liuwensui/Downloads/chinook.db"})

We can use JDBC to query data from the table, which is the same approach mentioned in “Clojure Data Analysis Cookbook”.


;; project.clj
;; (defproject prj "0.1.0-SNAPSHOT"
;;   :dependencies [[org.clojure/clojure "1.8.0"]
;;                  [org.clojure/java.jdbc "0.7.5"]
;;                  [org.xerial/sqlite-jdbc "3.7.2"]
;; ])

(require '[clojure.pprint :as p]
         '[clojure.java.jdbc :as j])

(p/print-table (j/query db (str "select tbl_name, type from sqlite_master where type = 'table' limit 3;")))

;; |       :tbl_name | :type |
;; |-----------------+-------|
;; |          albums | table |
;; | sqlite_sequence | table |
;; |         artists | table |

Alternatively, we can also use the ClojureQL package, as shown below.


;; project.clj
;; (defproject prj "0.1.0-SNAPSHOT"
;;   :dependencies [[org.clojure/clojure "1.8.0"]
;;                  [clojureql "1.0.5"]
;;                  [org.xerial/sqlite-jdbc "3.7.2"]
;; ])

(require '[clojure.pprint :as p]
         '[clojureql.core :as l])

(p/print-table @(-> (l/select (l/table db :sqlite_master) (l/where (= :type "table")))
                    (l/take 3)
                    (l/project [:tbl_name :type])))

;; | :type |       :tbl_name |
;; |-------+-----------------|
;; | table |          albums |
;; | table | sqlite_sequence |
;; | table |         artists |

After the data import, we can easily convert it to incanter dataset or clojure map for further data munging.

Two Ways to Select Rows in Incanter

user=> (use '(incanter core io))
nil

user=> (def iris (read-dataset "../data/iris.dat" :header true :delim \space))
#'user/iris

user=> (sel iris :rows (range 3))
[:Sepal.Length :Sepal.Width :Petal.Length :Petal.Width :Species]
[5.1 3.5 1.4 0.2 "setosa"]
[4.9 3.0 1.4 0.2 "setosa"]
[4.7 3.2 1.3 0.2 "setosa"]

;; METHOD 1 - USING $WHERE 
user=> ($where {:Species {:in #{"virginica" "setosa"}} :Sepal.Length {:gt 5.5, :lt 6.0}} iris)
[:Sepal.Length :Sepal.Width :Petal.Length :Petal.Width :Species]
[5.8 4.0 1.2 0.2 "setosa"]
[5.7 4.4 1.5 0.4 "setosa"]
[5.7 3.8 1.7 0.3 "setosa"]
[5.8 2.7 5.1 1.9 "virginica"]
[5.7 2.5 5.0 2.0 "virginica"]
[5.8 2.8 5.1 2.4 "virginica"]
[5.6 2.8 4.9 2.0 "virginica"]
[5.8 2.7 5.1 1.9 "virginica"]
[5.9 3.0 5.1 1.8 "virginica"]

;; METHOD 2 - USING QUERY-DATASET
user=> (query-dataset iris {:Species {:in #{"virginica" "setosa"}} :Sepal.Length {:gt 5.5, :lt 6.0}})
[:Sepal.Length :Sepal.Width :Petal.Length :Petal.Width :Species]
[5.8 4.0 1.2 0.2 "setosa"]
[5.7 4.4 1.5 0.4 "setosa"]
[5.7 3.8 1.7 0.3 "setosa"]
[5.8 2.7 5.1 1.9 "virginica"]
[5.7 2.5 5.0 2.0 "virginica"]
[5.8 2.8 5.1 2.4 "virginica"]
[5.6 2.8 4.9 2.0 "virginica"]
[5.8 2.7 5.1 1.9 "virginica"]
[5.9 3.0 5.1 1.8 "virginica"]

Read CSV Data File with Clojure

;; LOAD PACKAGES
user=> (use '(incanter core io))
nil

;; READ CSV FILE
user=> (def ds1 (read-dataset "../data/credit_count.csv" :header true :delim \,))
#'user/ds1

;; EXAMINE THE DATA
user=> ($ (range 0 3) :all ds1)
[:CARDHLDR :DEFAULT :AGE :ACADMOS :ADEPCNT :MAJORDRG :MINORDRG :OWNRENT :INCOME :SELFEMPL :INCPER :EXP_INC :SPENDING :LOGSPEND ]
[0 0 27.25 4 0 0 0 0 1200 0 18000 6.667E-4 " " "  "]
[0 0 40.8333321 111 3 0 0 1 4000 0 13500 2.222E-4 " " "  "]
[1 0 37.6666679 54 3 0 0 1 3666.6666667 0 11300 0.0332699 121.9896773 4.8039364]

;; CALCULATE SUMMARY BY GROUP
user=> ($rollup :mean :INCOME [:CARDHLDR :DEFAULT] ds1)
[:DEFAULT :CARDHLDR :INCOME]
[1 1 2156.117553547691]
[0 1 2653.2908642884945]
[0 0 2165.1530843234673]

Read Space-Delimited File with Clojure

;; load clojure libraries
user=> (use 'incanter.core 'incanter.io)
nil

;; import data file
user=> (def iris (read-dataset "../data/iris.dat" :header true :delim \space))
#'user/iris

;; show data header
user=> (col-names iris)
[:Sepal.Length :Sepal.Width :Petal.Length :Petal.Width :Species]

;; show first 3 data records with $
user=> ($ (range 0 3) :all iris)
[:Sepal.Length :Sepal.Width :Petal.Length :Petal.Width :Species]
[5.1 3.5 1.4 0.2 "setosa"]
[4.9 3.0 1.4 0.2 "setosa"]
[4.7 3.2 1.3 0.2 "setosa"]