blob: 2cd0b0a089b63a24767a0a189f2d4e854032b86a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
(ns dompa.coordinates
(:require [clojure.string :as str]))
(defn- construct-coordinates-reducer
  "Single step of the character scan that produces raw node coordinates.

  `state` is a map with:
    :char-type   - nil (undecided), :tag or :text
    :start-idx   - index where the current run started
    :coordinates - vector of [from to] pairs recorded so far (inclusive ends)

  The second argument is an `[idx c]` pair: the position of a character
  and the character itself. Returns the next state map."
  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
  (let [opening? (= \< c)
        closing? (= \> c)
        ;; every transition restarts the run at the current index
        restart  (fn [next-type next-coordinates]
                   {:char-type   next-type
                    :start-idx   idx
                    :coordinates next-coordinates})]
    (cond
      ;; nothing in progress: this character decides what the run is
      (nil? char-type)
      (restart (if (or opening? closing?) :tag :text) coordinates)

      ;; a `<` starts a tag; if text was in progress it ended on the
      ;; previous character, so record it before switching
      opening?
      (restart :tag
               (if (= :text char-type)
                 (conj coordinates [start-idx (dec idx)])
                 coordinates))

      ;; a `>` ends the current run, which is recorded including the `>`
      closing?
      (restart nil (conj coordinates [start-idx idx]))

      ;; any other character just extends the current run
      :else
      state)))
(defn- construct-coordinates
  "Scans `indexed-html`, a sequence of `[idx char]` pairs, and returns a
  vector of `[from to]` coordinates (inclusive ends), one per tag and one
  per run of text between tags.

  The reducer only records a text run when the next `<` is seen, so a text
  run that lasts until the end of the input would otherwise be dropped.
  It is appended here once the scan has finished."
  [indexed-html]
  (let [{:keys [char-type start-idx coordinates]}
        (reduce construct-coordinates-reducer
                {:char-type nil :start-idx 0 :coordinates []}
                indexed-html)]
    (if (= :text char-type)
      ;; :text can only be reached after consuming at least one pair,
      ;; so `last` is non-nil here; its index is the inclusive end.
      (conj coordinates [start-idx (first (last indexed-html))])
      coordinates)))
(defn coordinates->tag-name
  "Returns the tag name found at the `[from to]` coordinate of `html`.

  The substring `from` (inclusive) to `to` (exclusive) is cut at the first
  whitespace or `>`, and any `<`, `>` and `/` characters are stripped from
  what remains, so both `<p ...>` and `</p>` yield \"p\".

  Returns an empty string when no name can be extracted. On the JVM
  `str/split` returns an empty vector when every token is empty (for
  example a whitespace-only span), which would otherwise pass nil to
  `str/replace` and throw."
  [html [from to]]
  (let [head (-> (subs html from to)
                 (str/split #"[\s\>]")
                 first)]
    (str/replace (or head "") #"[\<\>\/]" "")))
(defn- name-coordinates-fn
  "Returns a function of `idx` and `coordinate` that pairs `idx` with the
  tag name found at `coordinate` in `html`, as `[idx tag-name]`."
  [html]
  (fn [idx coordinate]
    (let [tag-name (coordinates->tag-name html coordinate)]
      (vector idx tag-name))))
(defn- last-coordinate-by-tag-name-idx
  "Returns the index within `coordinates` of the most recent opening tag
  named `name` that ends before `start` and has not been closed yet, or
  nil when there is none.

  A coordinate is a candidate only when:
    - it ends before `start`,
    - it begins with `<` but not `</` (so text nodes and closing tags
      never match, even if their first word equals `name`), and
    - its end is still the first `>` after its start, i.e. it has not
      already been extended to a closing tag by an earlier merge.

  Indices refer to positions in `coordinates` itself, so the result is
  safe to use with `nth` / `assoc` on that vector."
  [html coordinates name start]
  (let [index-fn  (name-coordinates-fn html)
        open-tag? (fn [[from to]]
                    (and (< to start)
                         (= \< (nth html from nil))
                         (not= \/ (nth html (inc from) nil))
                         (= to (str/index-of html ">" from))))]
    (->> coordinates
         (keep-indexed (fn [idx coordinate]
                         (when (open-tag? coordinate)
                           (index-fn idx coordinate))))
         (filter (fn [[_ tag-name]] (= name tag-name)))
         last
         first)))

(defn- merge-coordinate
  "Handles the closing tag at `[start end]`: finds the matching open tag
  in `coordinates` and extends its end to `end`, so that the coordinate
  covers the whole element.

  A closing tag with no matching open tag is ignored and `coordinates`
  is returned unchanged."
  [html coordinates [start end]]
  (let [name (coordinates->tag-name html [start end])]
    (if-some [matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)]
      (let [[matching-start] (nth coordinates matching-idx)]
        (assoc coordinates matching-idx [matching-start end]))
      coordinates)))
(defn- merge-coordinates-reducer-fn
  "Builds the reducing function used to merge coordinates for `html`.

  The returned function takes the coordinates accumulated so far and the
  next `[start end]` pair. When the pair points at a closing tag (`</`),
  it is handed to `merge-coordinate`; otherwise it is appended."
  [html]
  (fn [merged [start end]]
    (let [span         [start end]
          closing-tag? (and (= \< (nth html start))
                            (= \/ (nth html (inc start) nil)))]
      (if closing-tag?
        (merge-coordinate html merged span)
        (conj merged span)))))
(defn merge-coordinates
  "Folds `coordinates` into a new vector using the reducing function built
  by `merge-coordinates-reducer-fn` for `html`, starting from an empty
  vector."
  [html coordinates]
  (let [merge-step (merge-coordinates-reducer-fn html)]
    (reduce merge-step [] coordinates)))
(defn children
  "Returns the coordinates that lie strictly inside `[from to]`, i.e. that
  start after `from` and end before `to`, ordered by their start index."
  [coordinates [from to]]
  (sort-by first
           (for [[child-from child-to :as coordinate] coordinates
                 :when (< from child-from)
                 :when (> to child-to)]
             coordinate)))
(defn without-children
  "Returns `coordinates` without the parent itself (any coordinate that
  starts at `parent-from`) and without any coordinate lying strictly
  inside `[parent-from parent-to]`."
  [coordinates [parent-from parent-to]]
  (filter (fn [[from to]]
            (and (not= from parent-from)
                 ;; keep it unless it is strictly inside the parent
                 (or (<= from parent-from)
                     (>= to parent-to))))
          coordinates))
(defn html->coordinates
  "Turns an `html` string into node coordinates: pairs every character
  with its index, scans the pairs into raw `[from to]` coordinates, then
  merges them with `merge-coordinates`."
  [html]
  (let [indexed-chars   (map-indexed vector html)
        raw-coordinates (construct-coordinates indexed-chars)]
    (merge-coordinates html raw-coordinates)))
|