diff options
| -rw-r--r-- | src/dompa/coordinates.clj | 100 | ||||
| -rw-r--r-- | src/dompa/coordinates.cljc | 202 | ||||
| -rw-r--r-- | src/dompa/core.clj | 52 | ||||
| -rw-r--r-- | src/dompa/html.cljc | 17 | ||||
| -rw-r--r-- | src/dompa/nodes.clj | 77 | ||||
| -rw-r--r-- | src/dompa/nodes.cljc | 59 | ||||
| -rw-r--r-- | src/dompa/utils.cljc | 1 | ||||
| -rw-r--r-- | test/dompa/coordinates_test.cljc | 14 | ||||
| -rw-r--r-- | test/dompa/html_test.cljc | 2 | ||||
| -rw-r--r-- | test/dompa/nodes_test.cljc | 2 | ||||
| -rw-r--r-- | test/dompa/utils_test.cljc | 2 |
11 files changed, 299 insertions, 229 deletions
diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj
deleted file mode 100644
index 2cd0b0a..0000000
--- a/src/dompa/coordinates.clj
+++ /dev/null
@@ -1,100 +0,0 @@
-(ns dompa.coordinates
-  (:require [clojure.string :as str]))
-
-(defn- construct-coordinates-reducer
-  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
-  (cond
-    ; we're undecided what to do next,
-    ; so we figure it out here
-    (nil? char-type)
-    {:char-type (if (some #{c} "<>") :tag :text)
-     :start-idx idx
-     :coordinates coordinates}
-
-    ; text ended, tag begins, which means we can
-    ; record text node coordinates
-    (and (= :text char-type)
-         (= \< c))
-    {:char-type :tag
-     :start-idx idx
-     :coordinates (conj coordinates [start-idx (dec idx)])}
-
-    ; otherwise don't record anything, just note
-    ; the start of a tag
-    (and (not= :text char-type)
-         (= \< c))
-    {:char-type :tag
-     :start-idx idx
-     :coordinates coordinates}
-
-    ; tag ended, record tag node coordinates
-    (= \> c)
-    {:char-type nil
-     :start-idx idx
-     :coordinates (conj coordinates [start-idx idx])}
-
-    :else state))
-
-(defn- construct-coordinates
-  [indexed-html]
-  (->> indexed-html
-       (reduce construct-coordinates-reducer {:char-type nil :start-idx 0 :coordinates []})
-       :coordinates))
-
-(defn coordinates->tag-name [html [from to]]
-  (-> (subs html from to)
-      (str/split #"[\s\>]")
-      first
-      (str/replace #"[\<\>\/]" "")))
-
-(defn- name-coordinates-fn [html]
-  (fn [idx coordinate]
-    [idx (coordinates->tag-name html coordinate)]))
-
-(defn- last-coordinate-by-tag-name-idx [html coordinates name start]
-  (let [filter-fn (fn [[_ end]] (< end start))
-        filtered-coordinates (filter filter-fn coordinates)
-        index-fn (name-coordinates-fn html)
-        named-coordinates (map-indexed index-fn filtered-coordinates)]
-    (->> named-coordinates
-         (filter #(= name (-> % last)))
-         last
-         first)))
-
-(defn- merge-coordinate [html coordinates [start end]]
-  (let [name (coordinates->tag-name html [start end])
-        matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)
-        [matching-start] (nth coordinates matching-idx)]
-    (assoc coordinates matching-idx [matching-start end])))
-
-(defn- merge-coordinates-reducer-fn [html]
-  (fn [coordinates [start end]]
-    (if (and (= \< (nth html start))
-             (= \/ (nth html (inc start) nil)))
-      (merge-coordinate html coordinates [start end])
-      (conj coordinates [start end]))))
-
-(defn merge-coordinates [html coordinates]
-  (-> (merge-coordinates-reducer-fn html)
-      (reduce [] coordinates)))
-
-(defn children
-  [coordinates [from to]]
-  (->> coordinates
-       (filter (fn [[iter-from iter-to]]
-                 (and (< from iter-from)
-                      (> to iter-to))))
-       (sort-by first)))
-
-(defn without-children
-  [coordinates [parent-from parent-to]]
-  (->> coordinates
-       (remove (fn [[from to]]
-                 (or (= from parent-from)
-                     (and (> from parent-from)
-                          (< to parent-to)))))))
-
-(defn html->coordinates [html]
-  (->> (map-indexed vector html)
-       construct-coordinates
-       (merge-coordinates html)))
diff --git a/src/dompa/coordinates.cljc b/src/dompa/coordinates.cljc
new file mode 100644
index 0000000..327f184
--- /dev/null
+++ b/src/dompa/coordinates.cljc
@@ -0,0 +1,202 @@
+(ns dompa.coordinates
+  (:require [clojure.string :as str]))
+
+(defn- compose-reducer
+  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
+  (cond
+    ; we're undecided what to do next,
+    ; so we figure it out here
+    (nil? char-type)
+    {:char-type (if (some #{c} "<>") :tag :text)
+     :start-idx idx
+     :coordinates coordinates}
+
+    ; text ended, tag begins, which means we can
+    ; record text node coordinates
+    (and (= :text char-type)
+         (= \< c))
+    {:char-type :tag
+     :start-idx idx
+     :coordinates (conj coordinates [start-idx (dec idx)])}
+
+    ; otherwise don't record anything, just note
+    ; the start of a tag
+    (and (not= :text char-type)
+         (= \< c))
+    {:char-type :tag
+     :start-idx idx
+     :coordinates coordinates}
+
+    ; tag ended, record tag node coordinates
+    (= \> c)
+    {:char-type nil
+     :start-idx idx
+     :coordinates (conj coordinates [start-idx idx])}
+
+    :else state))
+
+(defn compose
+  "Composes a given `html` string into a vector of coordinates.
+  These are single-pass coordinates without awareness of context,
+  thus HTML such as:
+
+  ```html
+  <div>hello</div>
+  ```
+
+  will return 3 coordinates (div, text, div) instead of 2 (div, text).
+  To unify the coordinates in a context-aware way, you pass the result
+  of this function to the `unify` function."
+  [html]
+  (let [default-state {:char-type nil
+                       :start-idx 0
+                       :coordinates []}
+        indexed-html (map-indexed vector html)]
+    (-> compose-reducer
+        (reduce default-state indexed-html)
+        :coordinates)))
+
+(defn- coordinates->tag-name
+  "Returns the tag name at `[start end]`, or nil when the coordinate is
+  text or an element that was already unified with its closing tag."
+  [html [start end]]
+  ; an un-unified tag still ends at the first `>` after its `<`
+  (when (and (= \< (nth html start nil))
+             (= end (str/index-of html ">" start)))
+    (->> (subs html start end)
+         (take-while #(not (contains? #{\space \>} %)))
+         (remove #(contains? #{\< \/} %))
+         (apply str))))
+
+(defn- last-by-tag-name-idx
+  "Returns the index in `coordinates` of the last still-open tag called
+  `name` that ends before `start`, or nil when there is none."
+  [html coordinates name start]
+  (->> coordinates
+       (keep-indexed (fn [idx [_ end :as coordinate]]
+                       (when (and (< end start)
+                                  (= name (coordinates->tag-name html coordinate)))
+                         idx)))
+       last))
+
+(defn- unify-one [html coordinates [start end]]
+  (let [name (coordinates->tag-name html [start end])
+        matching-idx (last-by-tag-name-idx html coordinates name start)]
+    (if matching-idx
+      (let [[matching-start] (nth coordinates matching-idx)]
+        (assoc coordinates matching-idx [matching-start end]))
+      coordinates)))
+
+(defn- unify-reducer-fn [html]
+  (fn [coordinates [start end]]
+    (if (and (= \< (nth html start))
+             (= \/ (nth html (inc start) nil)))
+      (unify-one html coordinates [start end])
+      (conj coordinates [start end]))))
+
+(defn unify
+  "Joins together given `coordinates` that represent
+  one HTML node in `html`, without which `html` such as:
+
+  ```html
+  <div>hello</div>
+  ```
+
+  would result in 3 nodes (div, text, div), instead of 2 (div, text),
+  because non-unified coordinates are blind to the context
+  in which they live, having only had one pass over the
+  raw HTML string which composes the initial coordinates."
+  [html coordinates]
+  (-> (unify-reducer-fn html)
+      (reduce [] coordinates)))
+
+(defn- children
+  [coordinates [from to]]
+  (->> coordinates
+       (filter (fn [[iter-from iter-to]]
+                 (and (< from iter-from)
+                      (> to iter-to))))
+       (sort-by first)))
+
+(defn- without-children
+  [coordinates [parent-from parent-to]]
+  (->> coordinates
+       (remove (fn [[from to]]
+                 (or (= from parent-from)
+                     (and (> from parent-from)
+                          (< to parent-to)))))))
+
+(defn- html-str->node-name
+  "Parses a given HTML string of a node to get its name as
+  a keyword. A text node will return `:dompa/text`."
+  [html]
+  (if (str/starts-with? html "<")
+    (->> (subs html 1)
+         (take-while #(not (contains? #{\space \>} %)))
+         (reduce str)
+         keyword)
+    :dompa/text))
+
+(defn- html-attr-str->k-v
+  "Parses a given HTML node attribute string into a
+  key-value pair."
+  [attr]
+  (->> (partition-by #(= % \=) attr)
+       (filter #(not= (-> % first) \=))
+       (map #(reduce str %))))
+
+(defn- normalize-html-attr-str
+  "Normalizes a given HTML attribute string. If it
+  has surrounding quotes, removes them."
+  [html-attr-str]
+  (if (str/starts-with? html-attr-str "\"")
+    (->> (subs html-attr-str 1)
+         (take-while #(not= % \"))
+         (reduce str))
+    html-attr-str))
+
+(defn- parse-html-attr-str
+  "Parses a given HTML attribute into a normalized
+  key-value map. Attributes with no value part are
+  treated as boolean attributes, and are always `true`."
+  [html-attr-str]
+  (let [[k v] (html-attr-str->k-v html-attr-str)
+        k (keyword k)
+        v (if (nil? v) true (normalize-html-attr-str v))]
+    {k v}))
+
+(defn- html-str->node-attrs [html]
+  (when (str/starts-with? html "<")
+    (->> (subs html 1)
+         (take-while #(not (contains? #{\> \/} %)))
+         (partition-by #(= % \space))
+         (drop 1)
+         (filter #(not= (-> % first) \space))
+         (map parse-html-attr-str)
+         (into {}))))
+
+(defn- construct-node
+  [node-html node-children]
+  (let [node-name (html-str->node-name node-html)]
+    (merge
+      {:name node-name}
+      (when (= node-name :dompa/text)
+        {:value node-html})
+      (when-let [attrs (html-str->node-attrs node-html)]
+        {:attrs attrs})
+      (when node-children
+        {:children node-children}))))
+
+(defn ->nodes
+  "Transform given `html` according to given `coordinates` into
+  a tree of nodes, each representing one HTML node and its children."
+  [html coordinates]
+  (when (seq coordinates)
+    (let [sorted-coordinates (sort-by first coordinates)
+          [parent-from parent-to] (first sorted-coordinates)
+          children (children sorted-coordinates [parent-from parent-to])
+          remaining (without-children sorted-coordinates [parent-from parent-to])
+          node-html (subs html parent-from (inc parent-to))
+          node-children (->nodes html children)]
+      (cons (construct-node node-html node-children)
+            (->nodes html remaining)))))
\ No newline at end of file diff --git a/src/dompa/core.clj b/src/dompa/core.clj deleted file mode 100644 index 4891c46..0000000 --- a/src/dompa/core.clj +++ /dev/null @@ -1,52 +0,0 @@ -(ns dompa.core - (:require - [dompa.coordinates :refer [html->coordinates]] - [dompa.nodes :refer [coordinates->nodes]])) - -(defn html->nodes [html] - (->> (html->coordinates html) - (coordinates->nodes html))) - -(def default-void-nodes - #{:img}) - -(defn- node->html - [{:keys [name content void-node?]}] - (if void-node? - (str "<" name ">") - (str "<" name ">" content "</" name ">"))) - -(defn nodes->html - ([nodes] - (nodes->html nodes {:void-nodes default-void-nodes})) - ([nodes {:keys [void-nodes]}] - (reduce - (fn [html node] - (if (= (-> node :name) :dompa/text) - (str html (-> node :value)) - (node->html {:name (-> node :name name) - :content (nodes->html (-> node :children)) - :void-node? (contains? void-nodes (-> node :name))}))) - "" - nodes))) - -(defn traverse-nodes [nodes pred] - (reduce - (fn [updated-nodes node] - (if-let [updated-node (pred node)] - (let [children (traverse-nodes (-> updated-node :children) pred)] - (conj updated-nodes (assoc updated-node :children children))) - updated-nodes)) - [] - nodes)) - -(defn traverse-html [html pred] - (-> (html->nodes html) - (traverse-nodes pred) - nodes->html)) - -(comment - (traverse-html "<div>asdasd<span>hello</span></div>" #(when-not (= (-> % :name) :span) - %)) - (html->coordinates "<div>hello<span>asd</span><strong>asdasdadad<img></strong></div>hello some text<div>another root element</div>") - (html->nodes "<div>hello<span><img src=\"test.jpg\" ckcche/>asd</span><strong>asdasdadad</strong>"))
\ No newline at end of file
diff --git a/src/dompa/html.cljc b/src/dompa/html.cljc
new file mode 100644
index 0000000..6268fa0
--- /dev/null
+++ b/src/dompa/html.cljc
@@ -0,0 +1,17 @@
+(ns dompa.html
+  (:require
+   [dompa.coordinates :as coordinates]))
+
+(defn ->coordinates
+  "Transform a `html` string into a vector of `[start end]` coordinates
+  indicating where each HTML node begins and ends."
+  [html]
+  (->> (coordinates/compose html)
+       (coordinates/unify html)))
+
+(defn ->nodes
+  "Transform a `html` string into a tree of nodes,
+  each representing one HTML node and its children."
+  [html]
+  (->> (->coordinates html)
+       (coordinates/->nodes html)))
\ No newline at end of file
diff --git a/src/dompa/nodes.clj b/src/dompa/nodes.clj
deleted file mode 100644
index 82734a3..0000000
--- a/src/dompa/nodes.clj
+++ /dev/null
@@ -1,77 +0,0 @@
-(ns dompa.nodes
-  (:require
-   [clojure.string :as str]
-   [dompa.coordinates :as coordinates]))
-
-(defn- html-str->node-name
-  "Parses a given HTML string of a node to get its name as
-  a keyword. A text node will return `:dompa/text`."
-  [html]
-  (if (str/starts-with? html "<")
-    (->> (subs html 1)
-         (take-while #(not (contains? #{\space \>} %)))
-         (reduce str)
-         keyword)
-    :dompa/text))
-
-(defn- html-attr-str->k-v
-  "Parses a given HTML node attribute string into a
-  key-value pair."
-  [attr]
-  (->> (partition-by #(= % \=) attr)
-       (filter #(not= (-> % first) \=))
-       (map #(reduce str %))))
-
-(defn- normalize-html-attr-str
-  "Normalizes a given HTML attribute string. If it
-  has surrounding quotes, removes them."
-  [html-attr-str]
-  (if (str/starts-with? html-attr-str "\"")
-    (->> (subs html-attr-str 1)
-         (take-while #(not= % \"))
-         (reduce str))
-    html-attr-str))
-
-(defn- parse-html-attr-str
-  "Parses a given HTML attribute into a normalized
-  key-value map. Attributes with no value part are
-  treated as boolean attributes, and are always `true`."
-  [html-attr-str]
-  (let [[k v] (html-attr-str->k-v html-attr-str)
-        k (keyword k)
-        v (if (nil? v) true (normalize-html-attr-str v))]
-    {k v}))
-
-(defn- html-str->node-attrs [html]
-  (when (str/starts-with? html "<")
-    (->> (subs html 1)
-         (take-while #(not (contains? #{\> \/} %)))
-         (partition-by #(= % \space))
-         (drop 1)
-         (filter #(not= (-> % first) \space))
-         (map parse-html-attr-str)
-         (into {}))))
-
-(defn- construct-node
-  [node-html node-children]
-  (let [node-name (html-str->node-name node-html)]
-    (merge
-      {:name node-name}
-      (when (= node-name :dompa/text)
-        {:value node-html})
-      (when-let [attrs (html-str->node-attrs node-html)]
-        {:attrs attrs})
-      (when node-children
-        {:children node-children}))))
-
-(defn coordinates->nodes
-  [html coordinates]
-  (when (seq coordinates)
-    (let [sorted-coordinates (sort-by first coordinates)
-          [parent-from parent-to] (first sorted-coordinates)
-          children (coordinates/children sorted-coordinates [parent-from parent-to])
-          remaining (coordinates/without-children sorted-coordinates [parent-from parent-to])
-          node-html (subs html parent-from (inc parent-to))
-          node-children (coordinates->nodes html children)]
-      (cons (construct-node node-html node-children)
-            (coordinates->nodes html remaining)))))
diff --git a/src/dompa/nodes.cljc b/src/dompa/nodes.cljc
new file mode 100644
index 0000000..d035116
--- /dev/null
+++ b/src/dompa/nodes.cljc
@@ -0,0 +1,59 @@
+(ns dompa.nodes)
+
+(def ^:private default-void-nodes
+  #{:!doctype :area :base :br :col :embed :hr :img :input
+    :link :meta :source :track :wbr})
+
+(defn- node->html-reducer-fn
+  "Returns a reducing fn that appends the HTML of one node to the
+  accumulated `html`; child nodes are rendered with `nodes->html-fn`."
+  [void-nodes nodes->html-fn]
+  (fn [html node]
+    (let [tag (-> node :name)]
+      (cond
+        (= tag :dompa/text) (str html (-> node :value))
+        ; void nodes have neither children nor a closing tag
+        (contains? void-nodes tag) (str html "<" (name tag) ">")
+        ; `name` drops the keyword colon, so `:div` renders as `<div>`
+        :else (str html "<" (name tag) ">" (nodes->html-fn (-> node :children))
+                   "</" (name tag) ">")))))
+
+(defn traverse
+  "Recursively traverses given tree of `nodes` with a `traverser-fn`
+  that gets a single node passed to it and returns the updated node.
+  If the traverser function returns `nil`, the node will be removed.
+  In any other case the node will be replaced. If you wish to keep
+  a node unchanged, just return it as-is."
+  [nodes traverser-fn]
+  (-> (fn [updated-nodes node]
+        (if-let [updated-node (traverser-fn node)]
+          (let [children (traverse (-> updated-node :children) traverser-fn)]
+            (conj updated-nodes (assoc updated-node :children children)))
+          updated-nodes))
+      (reduce [] nodes)))
+
+(defn ->html
+  "Transform a vector of `nodes` into an HTML string.
+
+  Options:
+  - `void-nodes` - A set of node names that are self-closing, defaults to:
+    - `:!doctype`
+    - `:area`
+    - `:base`
+    - `:br`
+    - `:col`
+    - `:embed`
+    - `:hr`
+    - `:img`
+    - `:input`
+    - `:link`
+    - `:meta`
+    - `:source`
+    - `:track`
+    - `:wbr`
+  "
+  ([nodes]
+   (->html nodes {:void-nodes default-void-nodes}))
+  ([nodes {:keys [void-nodes] :as opts}]
+   (-> (node->html-reducer-fn void-nodes #(->html % opts))
+       (reduce "" nodes))))
\ No newline at end of file
diff --git a/src/dompa/utils.cljc b/src/dompa/utils.cljc
new file mode 100644
index 0000000..c25e841
--- /dev/null
+++ b/src/dompa/utils.cljc
@@ -0,0 +1 @@
+(ns dompa.utils)
diff --git a/test/dompa/coordinates_test.cljc b/test/dompa/coordinates_test.cljc
new file mode 100644
index 0000000..efa6df5
--- /dev/null
+++ b/test/dompa/coordinates_test.cljc
@@ -0,0 +1,14 @@
+(ns dompa.coordinates-test
+  (:require [clojure.test :refer [deftest testing is]]
+            [dompa.coordinates :as coordinates]))
+
+(deftest compose-test
+  (testing "Create first-pass coordinates"
+    (is (= [[0 4] [5 9] [10 15]] (coordinates/compose "<div>hello</div>"))))
+
+  (testing "Create first-pass coordinates with invalid HTML"
+    (is (= [[0 4]] (coordinates/compose "<div>hello")))
+    (is (= [] (coordinates/compose "<div"))))
+
+  (testing "Create first-pass coordinates with just text"
+    (is (= [] (coordinates/compose "hello")))))  ; NOTE(review): text is only recorded once a later `<` ends it - confirm intended
\ No newline at end of file diff --git a/test/dompa/html_test.cljc b/test/dompa/html_test.cljc new file mode 100644 index 0000000..ab2fafd --- /dev/null +++ b/test/dompa/html_test.cljc @@ -0,0 +1,2 @@ +(ns dompa.html-test + (:require [clojure.test :refer [deftest is testing]])) diff --git a/test/dompa/nodes_test.cljc b/test/dompa/nodes_test.cljc new file mode 100644 index 0000000..55f9394 --- /dev/null +++ b/test/dompa/nodes_test.cljc @@ -0,0 +1,2 @@ +(ns dompa.nodes-test + (:require [clojure.test :refer [deftest is testing]])) diff --git a/test/dompa/utils_test.cljc b/test/dompa/utils_test.cljc new file mode 100644 index 0000000..9d0ddf3 --- /dev/null +++ b/test/dompa/utils_test.cljc @@ -0,0 +1,2 @@ +(ns dompa.utils-test + (:require [clojure.test :refer [deftest is testing]])) |
