summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Asko Nõmm <asko@nmm.ee> 2025-10-19 23:17:34 +0300
committer: Asko Nõmm <asko@nmm.ee> 2025-10-19 23:17:34 +0300
commit: 79159a47202cda8bdfa74e3e594f69ab1f6c9e2a (patch)
tree: 29ab638914f1493f122f41d2c14da7b4b7286745 /src
parent: 23542524c4b545c2489ec1a429f810cbfcd003cd (diff)
#5: Fixed an issue where same-name children of a root would cause the nodes to close too early.
Diffstat (limited to 'src')
-rw-r--r-- src/dompa/coordinates.cljc 118
-rw-r--r-- src/dompa/nodes.cljc 2
2 files changed, 67 insertions, 53 deletions
diff --git a/src/dompa/coordinates.cljc b/src/dompa/coordinates.cljc
index e5ca5e8..2388fcd 100644
--- a/src/dompa/coordinates.cljc
+++ b/src/dompa/coordinates.cljc
@@ -40,6 +40,13 @@
:start-idx idx
:coordinates (conj coordinates [start-idx idx])}
+ ; new tag starts while we were parsing another tag,
+ ; handles void elements
+ (and (= :tag char-type) (= \< c))
+ {:char-type :tag
+ :start-idx idx
+ :coordinates (conj coordinates [start-idx (dec idx)])}
+
; otherwise don't record anything, just note
; the start of a tag
(tag-starts? c char-type)
@@ -95,62 +102,69 @@
(apply str))
value)))
-(defn- name-coordinates-fn
- "Returns a function with the initial state of an `html`
- string, to be used to construct a sequence of `[index, name]`. "
- [html]
- (fn [idx coordinate]
- [idx (coordinates->tag-name html coordinate)]))
-
-(defn- last-by-tag-name-idx
- "Gets the last coordinate matching the tag `name` that occurred
- before `start`, for finding coordinates that should be merged
- together."
- [html coordinates name start]
- (let [filter-fn (fn [[_ end]] (< end start))
- filtered-coordinates (filter filter-fn coordinates)
- index-fn (name-coordinates-fn html)
- named-coordinates (map-indexed index-fn filtered-coordinates)]
- (->> named-coordinates
- (filter #(= name (-> % last)))
- last
- first)))
-
-(defn- unify-one
- [html coordinates [start end]]
- (let [name (coordinates->tag-name html [start end])
- matching-idx (last-by-tag-name-idx html coordinates name start)]
- (if matching-idx
- (let [[matching-start] (nth coordinates matching-idx)]
- (assoc coordinates matching-idx [matching-start end]))
- coordinates)))
-
-(defn- unify-reducer-fn
- "Returns a reducer function with the initial state of
- a `html` string."
- [html]
- (fn [coordinates [start end]]
- (if (and (= \< (nth html start))
- (= \/ (nth html (inc start) nil)))
- (unify-one html coordinates [start end])
- (conj coordinates [start end]))))
+(defn- coordinate-info
+ "Determines if a coordinate is an opening tag, closing tag, or text."
+ [html [start end]]
+ (let [value (subs html start (inc end))]
+ (cond
+ (str/starts-with? value "</")
+ {:coord-type :closing, :coord-name (coordinates->tag-name html [start end])}
+
+ (str/starts-with? value "<")
+ {:coord-type :opening, :coord-name (coordinates->tag-name html [start end])}
+
+ :else
+ {:coord-type :text, :coord-name :dompa/text})))
+
+(def ^:private void-elements
+ #{"area" "base" "br" "col" "embed" "hr" "img"
+ "input" "link" "meta" "param" "source" "track" "wbr"})
+
+(defn- handle-opening-tag [{:keys [stack unified coord coord-name start]}]
+ (if (void-elements coord-name)
+ {:stack stack
+ :unified (conj unified coord)}
+ {:stack (conj stack {:name coord-name :start start})
+ :unified unified}))
+
+(defn- handle-closing-tag [{:keys [stack unified coord-name end]}]
+ (if-let [last-open (peek stack)]
+ (if (= coord-name (:name last-open))
+ {:stack (pop stack)
+ :unified (conj unified [(:start last-open) end])}
+ {:stack stack :unified unified})
+ {:stack stack :unified unified}))
+
+(defn- unify-reducer-fn [html]
+ (fn [{:keys [stack unified]} [start end :as coord]]
+ (let [{:keys [coord-type coord-name]} (coordinate-info html coord)]
+ (cond
+ (= coord-type :opening)
+ (handle-opening-tag {:stack stack
+ :unified unified
+ :coord coord
+ :coord-name coord-name
+ :start start})
+
+ (= coord-type :closing)
+ (handle-closing-tag {:stack stack
+ :unified unified
+ :coord-name coord-name
+ :end end})
+
+ :else
+ {:stack stack
+ :unified (conj unified coord)}))))
(defn unify
"Joins together given `coordinates` that represent
- one HTML node in `html`, without which `html` such as:
-
- ```html
- <div>hello</div>
- ```
-
- would result in 3 nodes (div, text, div), instead of 2 (div, text),
- because non-unified coordinates are blind to the context
- in which they live, having only had one pass over the
- raw HTML string which composes the initial coordinates."
+ one HTML node in `html`, using a stack-based approach to correctly
+ handle nested and void tags."
[{:keys [html coordinates]}]
- {:html html
- :coordinates (-> (unify-reducer-fn html)
- (reduce [] coordinates))})
+ (let [initial-state {:stack [], :unified []}
+ result (reduce (unify-reducer-fn html) initial-state coordinates)]
+ {:html html
+ :coordinates (sort-by first (:unified result))}))
(defn- children
"Returns all the coordinates that belong between the given
diff --git a/src/dompa/nodes.cljc b/src/dompa/nodes.cljc
index 8600026..9dbecd0 100644
--- a/src/dompa/nodes.cljc
+++ b/src/dompa/nodes.cljc
@@ -1,7 +1,7 @@
(ns dompa.nodes)
(def ^:private default-void-nodes
- #{:!doctype :area :base :br :col :embed :hr :img :input
+ #{:!doctype :!DOCTYPE :area :base :br :col :embed :hr :img :input
:link :meta :source :track :wbr})
(defn- node-attrs-reducer [attrs k v]