summaryrefslogtreecommitdiff
path: root/src/dompa/coordinates.clj
blob: 2cd0b0a089b63a24767a0a189f2d4e854032b86a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
(ns dompa.coordinates
  (:require [clojure.string :as str]))

(defn- construct-coordinates-reducer
  "Reducing step that scans one `[idx c]` pair (index and character).

  The state map carries:
    :char-type   - nil (undecided), :text or :tag
    :start-idx   - index where the current run started
    :coordinates - vector of `[start end]` pairs recorded so far, where
                   both indexes are inclusive

  Returns the next state map."
  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
  (let [opens? (= \< c)
        closes? (= \> c)
        ; every transition restarts the run at the current index
        transition (fn [next-type next-coordinates]
                     {:char-type next-type
                      :start-idx idx
                      :coordinates next-coordinates})]
    (cond
      ; nothing in progress: the current character decides
      ; whether a tag or a text run starts here
      (nil? char-type)
      (transition (if (or opens? closes?) :tag :text) coordinates)

      ; a tag begins; if a text run was in progress it ended on
      ; the previous character, so record it first
      opens?
      (transition :tag
                  (if (= :text char-type)
                    (conj coordinates [start-idx (dec idx)])
                    coordinates))

      ; the current run ends on this character, record it and
      ; go back to the undecided state
      closes?
      (transition nil (conj coordinates [start-idx idx]))

      ; any other character just extends the current run
      :else state)))

(defn- construct-coordinates
  "Scans `indexed-html`, a sequence of `[idx char]` pairs, and returns a
  vector of inclusive `[start end]` coordinates, one per tag and one per
  text run.

  The reducer only records a text run when it meets the `<` that follows
  it, so a text run that reaches the end of the input is flushed here.
  Without that, trailing text (or input containing no tags at all) would
  be dropped. A tag left unterminated at the end of the input is still
  not recorded."
  [indexed-html]
  (let [{:keys [char-type start-idx coordinates]}
        (reduce construct-coordinates-reducer
                {:char-type nil :start-idx 0 :coordinates []}
                indexed-html)]
    (if (= :text char-type)
      ; a pending text run implies at least one pair was consumed,
      ; so `last` cannot be nil here
      (conj coordinates [start-idx (first (last indexed-html))])
      coordinates)))

(defn coordinates->tag-name
  "Returns the tag name found at the coordinate `[from to]` of `html`.

  The slice `(subs html from to)` is cut at the first whitespace or `>`
  character, and any `<`, `>` and `/` characters are stripped from the
  remaining token. So both `<div class=...>` and `</div>` give `div`.

  Returns an empty string when the slice holds no token at all, for
  example a run of whitespace between two tags. In that case the split
  produces no parts, and passing the resulting nil on to `str/replace`
  would throw."
  [html [from to]]
  (-> (subs html from to)
      (str/split #"[\s\>]")
      first
      ; a slice made only of separators splits into nothing
      (or "")
      (str/replace #"[\<\>\/]" "")))

(defn- name-coordinates-fn
  "Returns a two-argument function, suitable for `map-indexed`, that
  turns a position and a coordinate into a `[position tag-name]` pair.
  The tag name is read from `html` via `coordinates->tag-name`."
  [html]
  (let [tag-name-of (partial coordinates->tag-name html)]
    (fn [position coordinate]
      (vector position (tag-name-of coordinate)))))

(defn- last-coordinate-by-tag-name-idx
  "Returns the index within `coordinates` of the last opening tag named
  `name` that ends before `start` and has not been closed yet, or nil
  when there is none.

  Only coordinates that are still bare opening tags are candidates:
    - text nodes are skipped, so text whose first word equals the tag
      name is never mistaken for the opener;
    - elements already merged with their closing tag are skipped, so
      with nested tags of the same name each closing tag pairs with the
      nearest opener that is still open, not with one closed earlier.

  The returned index refers to `coordinates` itself, so it is safe to
  use with `nth` and `assoc` on that collection."
  [html coordinates name start]
  (let [open-tag? (fn [[from to]]
                    (and (< to start)
                         ; must start with `<` but not with `</`
                         (= \< (nth html from nil))
                         (not= \/ (nth html (inc from) nil))
                         ; a merged element contains the `<` of its
                         ; closing tag, a bare opening tag contains none
                         (not-any? #{\<} (subs html (inc from) to))))
        index-fn (name-coordinates-fn html)]
    (->> coordinates
         ; name only real candidates, keeping their original position
         (keep-indexed (fn [idx coordinate]
                         (when (open-tag? coordinate)
                           (index-fn idx coordinate))))
         (filter #(= name (last %)))
         last
         first)))

(defn- merge-coordinate
  "Handles the closing tag located at `[start end]` in `html`.

  Looks up the matching opening tag among `coordinates` and stretches
  that entry so that it ends at `end`, making it span the whole element.

  When no matching opening tag exists (a stray closing tag),
  `coordinates` is returned unchanged instead of failing on the
  missing index."
  [html coordinates [start end]]
  (let [name (coordinates->tag-name html [start end])]
    (if-some [matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)]
      (let [[matching-start] (nth coordinates matching-idx)]
        (assoc coordinates matching-idx [matching-start end]))
      ; nothing to close: ignore the stray closing tag
      coordinates)))

(defn- merge-coordinates-reducer-fn
  "Returns a reducing function over `[start end]` coordinates of `html`.

  A coordinate whose text starts with `</` is passed to
  `merge-coordinate` together with the accumulated vector. Any other
  coordinate is appended to the accumulated vector."
  [html]
  (fn [merged [start end]]
    (let [pair [start end]
          ; `nil` default guards the look-ahead past the end of `html`
          closing-tag? (and (= \< (nth html start))
                            (= \/ (nth html (inc start) nil)))]
      (if-not closing-tag?
        (conj merged pair)
        (merge-coordinate html merged pair)))))

(defn merge-coordinates
  "Folds `coordinates` into a new vector, starting from an empty one,
  using the reducing function built by `merge-coordinates-reducer-fn`
  for `html`."
  [html coordinates]
  (let [step (merge-coordinates-reducer-fn html)]
    (reduce step [] coordinates)))

(defn children
  "Returns the coordinates lying strictly inside `[from to]`, that is
  those starting after `from` and ending before `to`, ordered by their
  start index."
  [coordinates [from to]]
  (sort-by first
           (for [[child-from child-to :as coordinate] coordinates
                 :when (and (> child-from from)
                            (< child-to to))]
             coordinate)))

(defn without-children
  "Returns `coordinates` with two kinds of entries dropped: any entry
  that starts at `parent-from`, and any entry lying strictly inside
  `[parent-from parent-to]`. The remaining entries keep their order."
  [coordinates [parent-from parent-to]]
  (filter (fn [[node-from node-to]]
            ; keep what neither starts at the parent's start
            ; nor sits strictly inside the parent
            (and (not= node-from parent-from)
                 (or (<= node-from parent-from)
                     (>= node-to parent-to))))
          coordinates))

(defn html->coordinates
  "Turns the string `html` into node coordinates in two passes: first
  every character is paired with its index and scanned by
  `construct-coordinates`, then the result is passed through
  `merge-coordinates`."
  [html]
  (let [indexed-chars (map-indexed vector html)
        raw-coordinates (construct-coordinates indexed-chars)]
    (merge-coordinates html raw-coordinates)))