1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
|
(ns dompa.coordinates
(:require [clojure.string :as str]))
(defn- text-ended-and-tag-begins?
  "Returns true when the scanner is inside a text node (`char-type`
  is `:text`) and `char` is `<`, i.e. the text node ends right
  before a tag opens. Returns false otherwise."
  [char char-type]
  (= [:text \<] [char-type char]))
(defn- text-ended-by-html-ending?
  "Returns true when the scanner is inside a text node (`char-type`
  is `:text`) and `idx` is the index of the last character of an
  input that is `total-char-count` characters long."
  [char-type total-char-count idx]
  (if (= :text char-type)
    (= idx (dec total-char-count))
    false))
(defn- tag-starts?
  "Returns true when `char` is `<` while the scanner is not inside
  a text node (`char-type` is anything other than `:text`)."
  [char char-type]
  (cond
    (= :text char-type) false
    :else (= \< char)))
(defn- compose-reducer-fn
  "Returns a reducer function with initial state of
  `total-char-count` integer.

  The returned reducer takes a state map and an `[idx c]` pair
  (character index and character) and returns the next state map:

  - `:char-type` is `nil` (undecided), `:text` or `:tag`
  - `:start-idx` is the index where the node being scanned started
  - `:coordinates` is the vector of inclusive `[start end]` pairs
    recorded so far"
  [total-char-count]
  (let [last-idx (dec total-char-count)]
    (fn [{:keys [char-type start-idx coordinates] :as state} [idx c]]
      (cond
        ; we're undecided, but this is a text character that is
        ; also the very last character of the HTML. No later
        ; character will ever close this one-character text node,
        ; so it has to be recorded right away or it would be lost
        (and (nil? char-type)
             (= last-idx idx)
             (not (some #{c} "<>")))
        {:char-type nil
         :start-idx idx
         :coordinates (conj coordinates [idx idx])}
        ; we're undecided what to do next,
        ; so we figure it out here
        (nil? char-type)
        {:char-type (if (some #{c} "<>") :tag :text)
         :start-idx idx
         :coordinates coordinates}
        ; text ended, tag begins, which means we can
        ; record text node coordinates
        (text-ended-and-tag-begins? c char-type)
        {:char-type :tag
         :start-idx idx
         :coordinates (conj coordinates [start-idx (dec idx)])}
        ; text ended by HTML ending, record text node
        ; coordinates
        (text-ended-by-html-ending? char-type total-char-count idx)
        {:char-type nil
         :start-idx idx
         :coordinates (conj coordinates [start-idx idx])}
        ; otherwise don't record anything, just note
        ; the start of a tag
        (tag-starts? c char-type)
        {:char-type :tag
         :start-idx idx
         :coordinates coordinates}
        ; a `>` ends whatever node is being scanned, record
        ; its coordinates
        (= \> c)
        {:char-type nil
         :start-idx idx
         :coordinates (conj coordinates [start-idx idx])}
        :else state))))
(defn compose
  "Composes a given `html` string into a vector of coordinates,
  returned in a map together with the original `html`.

  These are single-pass coordinates without awareness of context,
  thus HTML such as:

  ```html
  <div>hello</div>
  ```

  will return 3 coordinates (div, text, div) instead of 2 (div, text).
  To unify the coordinates in a context-aware way, you pass the result
  of this function to the `unify` function."
  [html]
  (let [indexed-chars (map-indexed vector html)
        reducer (compose-reducer-fn (count indexed-chars))
        initial-state {:char-type nil
                       :start-idx 0
                       :coordinates []}
        final-state (reduce reducer initial-state indexed-chars)]
    {:html html
     :coordinates (:coordinates final-state)}))
(defn- coordinates->tag-name
  "Parses the given `html` string between the inclusive indexes
  of `start` and `end` for an HTML tag name.

  ```html
  <div>hello</div>
  ```

  Would become: `div`, for both the opening and the closing tag.
  When the span does not start with `<` (a text node), the text of
  the span is returned as-is."
  [html [start end]]
  (let [; coordinates are inclusive while `subs` is end-exclusive,
        ; hence the `inc`; clamped so an end index that already
        ; points one past the string does not throw
        value (subs html start (min (count html) (inc end)))]
    (if (str/starts-with? value "<")
      (->> value
           ; the name ends at the first space or at the tag end
           (take-while #(not (contains? #{\space \>} %)))
           ; drop the leading `<` and any closing / self-closing `/`
           (remove #(contains? #{\< \/} %))
           (apply str))
      value)))
(defn- name-coordinates-fn
  "Returns a two-argument function, closed over the `html` string,
  that turns an index and a coordinate into an `[index, name]`
  pair. The name is whatever `coordinates->tag-name` returns for
  that coordinate."
  [html]
  (fn [idx coordinate]
    (let [tag-name (coordinates->tag-name html coordinate)]
      (vector idx tag-name))))
(defn- last-by-tag-name-idx
  "Gets the index (into `coordinates`) of the last coordinate that
  is a still-open opening tag named `name` and that ends before
  `start`, for finding coordinates that should be merged together.
  Returns `nil` when there is no such coordinate.

  Only still-open opening tags are considered, so that:

  - text nodes whose text happens to equal `name` are never matched
  - in nested same-name tags such as `<div><div>a</div></div>`,
    the outer closing tag is paired with the outer opening tag
    rather than with the inner one that has already been closed"
  [html coordinates name start]
  (let [index-fn (name-coordinates-fn html)
        still-open? (fn [[from to]]
                      (and (< to start)
                           ; must be a tag, not a text node
                           (= \< (nth html from nil))
                           ; must not be a closing tag
                           (not= \/ (nth html (inc from) nil))
                           ; an unmerged opening tag ends at the first
                           ; `>` after its start; a merged coordinate
                           ; ends later, at its closing tag
                           (= to (str/index-of html ">" from))))]
    (->> coordinates
         ; index against the original `coordinates`, because the
         ; caller uses the result to look up and replace in it
         (keep-indexed (fn [idx coordinate]
                         (when (still-open? coordinate)
                           (index-fn idx coordinate))))
         (filter (fn [[_ tag-name]] (= name tag-name)))
         last
         first)))
(defn- unify-one
  "Handles one closing-tag coordinate `[start end]`. Looks up the
  index of the coordinate it should be merged with via
  `last-by-tag-name-idx`; when one is found, that coordinate's end
  is replaced with `end`. When none is found, `coordinates` is
  returned unchanged."
  [html coordinates [start end]]
  (let [tag-name (coordinates->tag-name html [start end])]
    (if-let [opening-idx (last-by-tag-name-idx html coordinates tag-name start)]
      (let [[opening-start] (nth coordinates opening-idx)]
        (assoc coordinates opening-idx [opening-start end]))
      coordinates)))
(defn- unify-reducer-fn
  "Returns a reducer function with the initial state of
  a `html` string.

  The reducer appends every coordinate to the accumulated vector,
  except coordinates that start with `</` (closing tags), which
  are handed to `unify-one` instead."
  [html]
  (fn [coordinates [start end]]
    (let [closing-tag? (and (= \< (nth html start))
                            (= \/ (nth html (inc start) nil)))]
      (if-not closing-tag?
        (conj coordinates [start end])
        (unify-one html coordinates [start end])))))
(defn unify
  "Joins together given `coordinates` that represent
  one HTML node in `html`, without which `html` such as:

  ```html
  <div>hello</div>
  ```

  would result in 3 nodes (div, text, div), instead of 2 (div, text),
  because non-unified coordinates are blind to the context
  in which they live, having only had one pass over the
  raw HTML string which composes the initial coordinates.

  Takes and returns a map of `:html` and `:coordinates`."
  [{:keys [html coordinates]}]
  (let [merge-closing-tags (unify-reducer-fn html)]
    {:html html
     :coordinates (reduce merge-closing-tags [] coordinates)}))
(defn- children
  "Returns all the coordinates that lie strictly between the given
  `from` and `to` indexes, sorted by their start index."
  [coordinates [from to]]
  (let [inside? (fn [[child-from child-to]]
                  (and (> child-from from)
                       (< child-to to)))]
    (sort-by first (filter inside? coordinates))))
(defn- without-children
  "Returns the coordinates that remain after dropping every
  coordinate that starts at `parent-from`, and every coordinate
  that lies strictly between `parent-from` and `parent-to`."
  [coordinates [parent-from parent-to]]
  (let [parent-or-child? (fn [[from to]]
                           (or (= from parent-from)
                               (and (> from parent-from)
                                    (< to parent-to))))]
    (filter (complement parent-or-child?) coordinates)))
(defn- html-str->node-name
  "Parses a given HTML string of a node to get its name as
  a keyword. A text node will return `:dompa/text`.

  The name ends at the first space, `>` or `/`, so a self-closing
  tag written without a space, such as `<br/>`, yields `:br`. A
  leading `/` (a closing tag) is kept as part of the name."
  [html]
  (if (str/starts-with? html "<")
    (let [inner (subs html 1)
          closing? (str/starts-with? inner "/")
          tag-name (->> (if closing? (subs inner 1) inner)
                        (take-while #(not (contains? #{\space \> \/} %)))
                        (apply str))]
      ; re-attach the leading slash so closing tags keep the name
      ; they had before `/` became a terminator
      (keyword (if closing? (str "/" tag-name) tag-name)))
    :dompa/text))
(defn- html-attr-str->k-v
  "Parses a given HTML node attribute string into a sequence of
  its key and, when present, its value.

  The string is split at the first `=` only, so values that
  contain `=` themselves (such as URLs with a query string) stay
  intact. An attribute with no value part yields just the key."
  [attr]
  (->> (str/split attr #"=" 2)
       ; an empty side (`a=` or `=a`) counts as absent
       (remove empty?)))
(defn- normalize-html-attr-str
  "Normalizes a given HTML attribute string. If it
  has surrounding quotes, removes them.

  Both double and single quotes are recognised. The value is
  everything between the opening quote and the next quote of the
  same kind. Strings that do not start with a quote are returned
  unchanged."
  [html-attr-str]
  (let [quote-char (first html-attr-str)]
    (if (contains? #{\" \'} quote-char)
      (->> (subs html-attr-str 1)
           (take-while #(not= % quote-char))
           (apply str))
      html-attr-str)))
(defn- parse-html-attr-str
  "Parses a given HTML attribute into a single-entry map of a
  keyword key to a normalized value. Attributes with no value
  part are treated as boolean attributes, and are always `true`."
  [html-attr-str]
  (let [[attr-name attr-value] (html-attr-str->k-v html-attr-str)]
    {(keyword attr-name) (if (some? attr-value)
                           (normalize-html-attr-str attr-value)
                           true)}))
(defn- html->str->node-attrs-reducer-fn
  "Returns a reducer function with initial state of `attrs-html`.

  The reducer walks `[idx c]` pairs of `attrs-html` and cuts it
  into one string per attribute, collected under `:attrs`.
  `:start-idx` is where the current attribute began, and
  `:has-attrs?` is true while inside a quoted value, so that
  spaces in there do not split the attribute."
  [attrs-html]
  (fn [{:keys [start-idx has-attrs? attrs] :as state} [idx c]]
    (let [after-idx (inc idx)
          next-char (get attrs-html after-idx)
          prev-char (get attrs-html (dec idx))
          collect-until (fn [end]
                          (conj attrs (subs attrs-html start-idx end)))]
      (cond
        ; last character of attrs-html, so collect whatever is left
        (= (count attrs-html) after-idx)
        {:start-idx 0
         :has-attrs? has-attrs?
         :attrs (collect-until after-idx)}
        ; a space outside of a quoted value ends the current
        ; attribute, collect it
        (and (= \space c)
             (not has-attrs?))
        {:start-idx after-idx
         :has-attrs? false
         :attrs (collect-until idx)}
        ; a = directly followed by a quote opens a quoted value
        (and (= \= c)
             (= \" next-char))
        {:start-idx start-idx
         :has-attrs? true
         :attrs attrs}
        ; a quote that does not directly follow a =, with a space
        ; or nothing after it, closes the quoted value
        (and (= \" c)
             (not= \= prev-char)
             (or (nil? next-char)
                 (= \space next-char))
             has-attrs?)
        {:start-idx after-idx
         :has-attrs? false
         :attrs (collect-until after-idx)}
        :else state))))
(defn- html->attrs-html
  "Transforms a given `html` string into a string portion of
  just the attributes.

  ```html
  <div class=\"test\"></div>
  ```

  would become

  ```
  class=\"test\"
  ```

  Only the first `>` ends the tag, so slashes inside attribute
  values (such as in URLs) are kept. A trailing `/` is treated as
  the self-closing marker and is dropped."
  [html]
  (let [; everything between the opening `<` and the first `>`
        tag-body (->> (subs html 1)
                      (take-while #(not= \> %))
                      (apply str))
        ; the first run of non-space characters is the tag name,
        ; what follows it are the attributes
        after-name (->> tag-body
                        (partition-by #(= % \space))
                        (drop 1)
                        flatten
                        (apply str))]
    (-> after-name
        ; strip the self-closing marker, e.g. `checked /`
        (str/replace #"\s*/\s*$" "")
        str/trim)))
(defn- html-str->node-attrs
  "Turns a given `html` string into an attribute map, e.g:

  ```html
  <input type=\"checkbox\" checked />
  ```

  Would become:

  ```clojure
  {:type \"checkbox\"
   :checked true}
  ```

  Returns `nil` when `html` does not start with `<`."
  [html]
  (when (str/starts-with? html "<")
    (let [attrs-html (html->attrs-html html)
          reducer (html->str->node-attrs-reducer-fn attrs-html)
          initial-state {:start-idx 0
                         :has-attrs? false
                         :attrs []}
          {:keys [attrs]} (reduce reducer
                                  initial-state
                                  (map-indexed vector attrs-html))]
      (->> attrs
           ; drop the blank pieces left between attributes
           (remove str/blank?)
           (map parse-html-attr-str)
           (into {})))))
(defn- construct-node
  "Constructs a node map from `node-html` string and
  its children nodes.

  The map always has `:node/name`. It also gets `:node/value`
  (the `node-html` itself) for text nodes, `:node/attrs` when
  attributes were parsed, and `:node/children` when
  `node-children` is not nil."
  [node-html node-children]
  (let [node-name (html-str->node-name node-html)
        node-attrs (html-str->node-attrs node-html)]
    (merge {:node/name node-name}
           (when (= node-name :dompa/text)
             {:node/value node-html})
           (when (some? node-attrs)
             {:node/attrs node-attrs})
           (when (some? node-children)
             {:node/children node-children}))))
(defn ->nodes
  "Transform given `html` according to given `coordinates` into
  a tree of nodes, each representing one HTML node and its children.
  Direct output of both `compose` and `unify` can be given to this
  function, allowing chaining such as:

  ```clojure
  (-> \"some html ...\"
      coordinates/compose
      coordinates/unify
      coordinates/->nodes)
  ```

  Returns a vector of node maps, or `nil` when there are no
  coordinates."
  [{:keys [html coordinates]}]
  (when (seq coordinates)
    ; siblings are walked with `loop`/`recur` so that stack depth
    ; depends on how deeply nodes are nested, not on how many
    ; siblings there are; sorting happens once up front since
    ; `without-children` keeps the order intact
    (loop [pending (sort-by first coordinates)
           nodes []]
      (if (empty? pending)
        nodes
        (let [[parent-from parent-to] (first pending)
              ; coordinates are inclusive, `subs` is end-exclusive
              node-html (subs html parent-from (inc parent-to))
              node-children (->nodes {:html html
                                      :coordinates (children pending [parent-from parent-to])})]
          (recur (without-children pending [parent-from parent-to])
                 (conj nodes (construct-node node-html node-children))))))))
|