From 9f31cc6a5aae83f7986e733603e62e91eae31196 Mon Sep 17 00:00:00 2001 From: Asko Nõmm Date: Sat, 20 Sep 2025 19:58:00 +0300 Subject: Many fixes to coordinates logic in capturing attributes, and more tests. Coordinates ns is shaping up well. --- src/dompa/coordinates.cljc | 130 +++++++++++++++++++++++++++++++-------- test/dompa/coordinates_test.cljc | 118 ++++++++++++++++++++++++++++++----- 2 files changed, 209 insertions(+), 39 deletions(-) diff --git a/src/dompa/coordinates.cljc b/src/dompa/coordinates.cljc index 8610b8b..3584dc9 100644 --- a/src/dompa/coordinates.cljc +++ b/src/dompa/coordinates.cljc @@ -61,11 +61,13 @@ :start-idx 0 :coordinates []} indexed-html (map-indexed vector html)] - (-> (compose-reducer-fn (count indexed-html)) - (reduce default-state indexed-html) - :coordinates))) + {:html html + :coordinates (-> (compose-reducer-fn (count indexed-html)) + (reduce default-state indexed-html) + :coordinates)})) -(defn- coordinates->tag-name [html [start end]] +(defn- coordinates->tag-name + [html [start end]] (let [value (subs html start end)] (if (str/starts-with? value "<") (->> (subs html start end) @@ -74,11 +76,13 @@ (apply str)) value))) -(defn- name-coordinates-fn [html] +(defn- name-coordinates-fn + [html] (fn [idx coordinate] [idx (coordinates->tag-name html coordinate)])) -(defn- last-by-tag-name-idx [html coordinates name start] +(defn- last-by-tag-name-idx + [html coordinates name start] (let [filter-fn (fn [[_ end]] (< end start)) filtered-coordinates (filter filter-fn coordinates) index-fn (name-coordinates-fn html) @@ -88,7 +92,8 @@ last first))) -(defn- unify-one [html coordinates [start end]] +(defn- unify-one + [html coordinates [start end]] (let [name (coordinates->tag-name html [start end]) matching-idx (last-by-tag-name-idx html coordinates name start)] (if matching-idx @@ -96,7 +101,8 @@ (assoc coordinates matching-idx [matching-start end])) coordinates))) -(defn- unify-reducer-fn [html] +(defn- unify-reducer-fn + [html] (fn [coordinates [start end]] (if (and (= \< (nth html start)) (= \/ (nth html (inc start) nil))) @@ -115,9 +121,10 @@ because non-unified coordinates are blind to the context in which they live, having only had one pass over the raw HTML string which composes the initial coordinates." - [html coordinates] - (-> (unify-reducer-fn html) - (reduce [] coordinates))) + [{:keys [html coordinates]}] + {:html html + :coordinates (-> (unify-reducer-fn html) + (reduce [] coordinates))}) (defn- children [coordinates [from to]] @@ -174,15 +181,79 @@ v (if (nil? v) true (normalize-html-attr-str v))] {k v})) -(defn- html-str->node-attrs [html] +(defn- html->str->node-attrs-reducer + [attrs-html] + (fn [{:keys [start-idx has-attrs? attrs] :as state} [idx c]] + (cond + ; end of attrs-html, so lets collect whatever there is left + (= (count attrs-html) (inc idx)) + {:start-idx 0 + :has-attrs? has-attrs? + :attrs (conj attrs (subs attrs-html start-idx (inc idx)))} + + ; encountered a space, but there's no attrs, we're good + ; to collect whatever there is. + (and (= \space c) + (not has-attrs?)) + {:start-idx (inc idx) + :has-attrs? false + :attrs (conj attrs (subs attrs-html start-idx idx))} + + ; if we discover a = with quote after it, + ; it means we have attrs + (and (= \= c) + (= \" (get attrs-html (inc idx)))) + {:start-idx start-idx + :has-attrs? true + :attrs attrs} + + ; quote with either space next or nothing next, + ; and no = before, and we have attrs + (and (= \" c) + (not (= \= (get attrs-html (dec idx)))) + (or (nil? (get attrs-html (inc idx))) + (= \space (get attrs-html (inc idx)))) + has-attrs?) + {:start-idx (inc idx) + :has-attrs? false + :attrs (conj attrs (subs attrs-html start-idx (inc idx)))} + + :else state))) + +(defn- html->str->attrs-html-str [html] + (->> (subs html 1) + (take-while #(not (contains? #{\> \/} %))) + (partition-by #(= % \space)) + (drop 1) + flatten + (reduce str) + str/trim)) + +(defn- html-str->node-attrs + "Turns a given `html` string into an attribute map, e.g: + + ```html + + ``` + + Would become: + + ```clojure + {:type \"checkbox\" + :checked true} + ```" + [html] (when (str/starts-with? html "<") - (->> (subs html 1) - (take-while #(not (contains? #{\> \/} %))) - (partition-by #(= % \space)) - (drop 1) - (filter #(not= (-> % first) \space)) - (map parse-html-attr-str) - (into {})))) + (let [attrs-html (html->str->attrs-html-str html) + indexed-attrs-html (map-indexed vector attrs-html) + default-reducer-state {:start-idx 0 + :has-attrs? false + :attrs []}] + (as-> (html->str->node-attrs-reducer attrs-html) $ + (reduce $ default-reducer-state indexed-attrs-html) + (remove str/blank? (:attrs $)) + (map parse-html-attr-str $) + (into {} $))))) (defn- construct-node [node-html node-children] @@ -198,14 +269,25 @@ (defn ->nodes "Transform given `html` according to given `coordinates` into - a tree of nodes, each representing one HTML node and its children." - [html coordinates] + a tree of nodes, each representing one HTML node and its children. + + Direct output of both `compose` and `unify` can be given to this + function, allowing chaining such as: + + ```clojure + (-> \"some html ...\" + coordinates/compose + coordinates/unify + coordinates/->nodes) + ```" + [{:keys [html coordinates]}] (when (seq coordinates) (let [sorted-coordinates (sort-by first coordinates) [parent-from parent-to] (first sorted-coordinates) children (children sorted-coordinates [parent-from parent-to]) remaining (without-children sorted-coordinates [parent-from parent-to]) node-html (subs html parent-from (inc parent-to)) - node-children (->nodes html children)] - (cons (construct-node node-html node-children) - (->nodes html remaining))))) \ No newline at end of file + node-children (->nodes {:html html :coordinates children})] + (-> (cons (construct-node node-html node-children) + (->nodes {:html html :coordinates remaining})) + vec)))) \ No newline at end of file diff --git a/test/dompa/coordinates_test.cljc b/test/dompa/coordinates_test.cljc index a827968..eb118d2 100644 --- a/test/dompa/coordinates_test.cljc +++ b/test/dompa/coordinates_test.cljc @@ -4,28 +4,116 @@ (deftest compose-test (testing "Create coordinates" - (is (= [[0 4] [5 9] [10 15]] - (coordinates/compose "
hello
")))) + (let [html "
hello
"] + (is (= {:html html + :coordinates [[0 4] [5 9] [10 15]]} + (coordinates/compose html))))) (testing "Create coordinates with invalid HTML" - (is (= [[0 4] [5 9]] - (coordinates/compose "
hello"))) + (let [html "
hello"] + (is (= {:html html + :coordinates [[0 4] [5 9]]} + (coordinates/compose html)))) - (is (= [] - (coordinates/compose "hello"] + (is (= {:html html + :coordinates [[0 4] [5 9] [10 16]]} + (coordinates/compose html)))) - (is (= [[0 3]] - (coordinates/compose "div>"))) + (let [html "")))) + (let [html "div>"] + (is (= {:html html + :coordinates [[0 3]]} + (coordinates/compose html)))) + + (let [html "<>"] + (is (= {:html html + :coordinates [[0 1]]} + (coordinates/compose html))))) (testing "Create coordinates with just text" - (is (= [[0 4]] - (coordinates/compose "hello")))) + (let [html "hello"] + (is (= {:html html + :coordinates [[0 4]]} + (coordinates/compose html))))) (testing "Create coordinates with text starting" - (is (= [[0 4] [5 9] [10 15]] - (coordinates/compose "hello
"))))) + (let [html "hello
"] + (is (= {:html html + :coordinates [[0 4] [5 9] [10 15]]} + (coordinates/compose html))))) + + (testing "Create coordinates with text ending" + (let [html "
hello"] + (is (= {:html html + :coordinates [[0 4] [5 10] [11 15]]} + (coordinates/compose html)))))) + +(deftest unify-test + (testing "Unify coordinates" + (let [html "
hello
"] + (is (= {:html html + :coordinates [[0 15] [5 9]]} + (-> (coordinates/compose html) + coordinates/unify))))) + + (testing "Unify coordinates with invalid HTML" + (let [html "
hello"] + (is (= {:html html + :coordinates [[0 4] [5 9]]} + (-> (coordinates/compose html) + coordinates/unify)))) + + (let [html "
hello"] + (is (= {:html html + :coordinates [[0 4] [5 9]]} + (-> (coordinates/compose html) + coordinates/unify))))) + + (testing "Unify coordinates with just text" + (let [html "hello"] + (is (= {:html html + :coordinates [[0 4]]} + (-> (coordinates/compose html) + coordinates/unify))))) + + (testing "Unify coordinates with text starting" + (let [html "hello
"] + (is (= {:html html + :coordinates [[0 4] [5 15]]} + (-> (coordinates/compose html) + coordinates/unify))))) + + (testing "Unify coordinates with text ending" + (let [html "
hello"] + (is (= {:html html + :coordinates [[0 10] [11 15]]} + (-> (coordinates/compose html) + coordinates/unify)))))) + +(deftest nodes-test + (testing "Create nodes" + (is (= [{:name :div + :attrs {} + :children [{:name :dompa/text + :value "hello"}]}] + (-> "
hello
" + coordinates/compose + coordinates/unify + coordinates/->nodes)))) -(deftest unify-test) \ No newline at end of file + (testing "Create nodes with attributes" + (is (= [{:attrs {:class "some test classes" + :data-attr "something" + :checked true} + :children [{:name :dompa/text + :value "hello"}] + :name :div}] + (-> "
hello
" + coordinates/compose + coordinates/unify + coordinates/->nodes))))) \ No newline at end of file -- cgit v1.2.3