summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/dompa/coordinates.clj100
-rw-r--r--src/dompa/coordinates.cljc202
-rw-r--r--src/dompa/core.clj52
-rw-r--r--src/dompa/html.cljc17
-rw-r--r--src/dompa/nodes.clj77
-rw-r--r--src/dompa/nodes.cljc59
-rw-r--r--src/dompa/utils.cljc1
7 files changed, 279 insertions, 229 deletions
diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj
deleted file mode 100644
index 2cd0b0a..0000000
--- a/src/dompa/coordinates.clj
+++ /dev/null
@@ -1,100 +0,0 @@
-(ns dompa.coordinates
- (:require [clojure.string :as str]))
-
-(defn- construct-coordinates-reducer
- [{:keys [char-type start-idx coordinates] :as state} [idx c]]
- (cond
- ; we're undecided what to do next,
- ; so we figure it out here
- (nil? char-type)
- {:char-type (if (some #{c} "<>") :tag :text)
- :start-idx idx
- :coordinates coordinates}
-
- ; text ended, tag begins, which means we can
- ; record text node coordinates
- (and (= :text char-type)
- (= \< c))
- {:char-type :tag
- :start-idx idx
- :coordinates (conj coordinates [start-idx (dec idx)])}
-
- ; otherwise don't record anything, just note
- ; the start of a tag
- (and (not= :text char-type)
- (= \< c))
- {:char-type :tag
- :start-idx idx
- :coordinates coordinates}
-
- ; tag ended, record tag node coordinates
- (= \> c)
- {:char-type nil
- :start-idx idx
- :coordinates (conj coordinates [start-idx idx])}
-
- :else state))
-
-(defn- construct-coordinates
- [indexed-html]
- (->> indexed-html
- (reduce construct-coordinates-reducer {:char-type nil :start-idx 0 :coordinates []})
- :coordinates))
-
-(defn coordinates->tag-name [html [from to]]
- (-> (subs html from to)
- (str/split #"[\s\>]")
- first
- (str/replace #"[\<\>\/]" "")))
-
-(defn- name-coordinates-fn [html]
- (fn [idx coordinate]
- [idx (coordinates->tag-name html coordinate)]))
-
-(defn- last-coordinate-by-tag-name-idx [html coordinates name start]
- (let [filter-fn (fn [[_ end]] (< end start))
- filtered-coordinates (filter filter-fn coordinates)
- index-fn (name-coordinates-fn html)
- named-coordinates (map-indexed index-fn filtered-coordinates)]
- (->> named-coordinates
- (filter #(= name (-> % last)))
- last
- first)))
-
-(defn- merge-coordinate [html coordinates [start end]]
- (let [name (coordinates->tag-name html [start end])
- matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)
- [matching-start] (nth coordinates matching-idx)]
- (assoc coordinates matching-idx [matching-start end])))
-
-(defn- merge-coordinates-reducer-fn [html]
- (fn [coordinates [start end]]
- (if (and (= \< (nth html start))
- (= \/ (nth html (inc start) nil)))
- (merge-coordinate html coordinates [start end])
- (conj coordinates [start end]))))
-
-(defn merge-coordinates [html coordinates]
- (-> (merge-coordinates-reducer-fn html)
- (reduce [] coordinates)))
-
-(defn children
- [coordinates [from to]]
- (->> coordinates
- (filter (fn [[iter-from iter-to]]
- (and (< from iter-from)
- (> to iter-to))))
- (sort-by first)))
-
-(defn without-children
- [coordinates [parent-from parent-to]]
- (->> coordinates
- (remove (fn [[from to]]
- (or (= from parent-from)
- (and (> from parent-from)
- (< to parent-to)))))))
-
-(defn html->coordinates [html]
- (->> (map-indexed vector html)
- construct-coordinates
- (merge-coordinates html)))
diff --git a/src/dompa/coordinates.cljc b/src/dompa/coordinates.cljc
new file mode 100644
index 0000000..327f184
--- /dev/null
+++ b/src/dompa/coordinates.cljc
@@ -0,0 +1,202 @@
+(ns dompa.coordinates
+ (:require [clojure.string :as str]))
+
+(defn- compose-reducer
+  "Reducing step used by `compose`. Takes the scan `state` (`:char-type`,
+  `:start-idx`, `:coordinates`) plus an `[idx c]` pair and returns the
+  state after looking at character `c` found at index `idx`."
+  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
+  (let [next-state (fn [kind coords]
+                     {:char-type kind
+                      :start-idx idx
+                      :coordinates coords})]
+    (cond
+      ; nothing is open yet: the current character decides
+      ; whether a tag or a text run starts here
+      (nil? char-type)
+      (next-state (if (some #{c} "<>") :tag :text) coordinates)
+
+      ; a tag opens; a text run that was in progress ends on the
+      ; previous character and is recorded, anything else is not
+      (= \< c)
+      (next-state :tag (if (= :text char-type)
+                         (conj coordinates [start-idx (dec idx)])
+                         coordinates))
+
+      ; `>` closes what is open; the span is recorded including `>`
+      (= \> c)
+      (next-state nil (conj coordinates [start-idx idx]))
+
+      :else state)))
+
+(defn compose
+  "Composes a given `html` string into a vector of `[start end]`
+  coordinates (both indexes inclusive). These are single-pass
+  coordinates without awareness of context, thus HTML such as:
+
+  ```html
+  <div>hello</div>
+  ```
+
+  will return 3 coordinates (div, text, div) instead of 2 (div, text).
+  To unify the coordinates in a context-aware way, you pass the result
+  of this function to the `unify` function.
+
+  A text run that reaches the end of `html` (for example the `tail` in
+  `<p>a</p>tail`, or an input without any tags) is recorded as well."
+  [html]
+  (let [default-state {:char-type nil
+                       :start-idx 0
+                       :coordinates []}
+        indexed-html (map-indexed vector html)
+        {:keys [char-type start-idx coordinates]}
+        (reduce compose-reducer default-state indexed-html)]
+    ; the reducer only records a text run when it meets the `<` that
+    ; follows it, so a run still open at the end has to be flushed here
+    (if (= :text char-type)
+      (conj coordinates [start-idx (dec (count html))])
+      coordinates)))
+
+(defn- coordinates->tag-name
+  "Returns the tag name found in `html` at the given coordinate, for
+  both opening (`<div ...>`) and closing (`</div>`) tags. The name ends
+  at the first whitespace character or `>`. When the coordinate does
+  not start with `<` (a text node), the substring itself is returned.
+
+  Note that `end` is used as an exclusive bound here, so the last
+  character of the coordinate is not part of the inspected value."
+  [html [start end]]
+  (let [value (subs html start end)
+        ; any whitespace may separate the name from the attributes,
+        ; not only a plain space
+        terminator? #{\space \tab \newline \return \formfeed \>}]
+    (if (str/starts-with? value "<")
+      (->> value
+           (take-while (complement terminator?))
+           (remove #{\< \/})
+           (apply str))
+      value)))
+
+(defn- name-coordinates-fn
+  "Returns a two-argument function, suited for `map-indexed`, that turns
+  an index and a coordinate into an `[index name]` pair. The name is
+  resolved against `html` with `coordinates->tag-name`."
+  [html]
+  #(vector %1 (coordinates->tag-name html %2)))
+
+(defn- last-by-tag-name-idx
+  "Returns the index within `coordinates` of the last opening tag called
+  `name` that ends before `start` and is still waiting for its closing
+  tag, or `nil` when there is none.
+
+  Only coordinates that are a bare opening tag qualify: they begin with
+  `<` (not `</`) and end on the first `>` after their start. This
+  excludes text nodes whose content happens to equal the tag name, and
+  elements that were already unified with a closing tag, so nested
+  elements of the same name pair up correctly."
+  [html coordinates name start]
+  (let [open-tag? (fn [[from to]]
+                    (and (= \< (nth html from))
+                         (not= \/ (nth html (inc from) nil))
+                         ; an already unified coordinate reaches past
+                         ; the `>` of its own opening tag
+                         (= to (str/index-of html ">" from))))
+        candidate? (fn [[_ [_ to :as coordinate]]]
+                     (and (< to start)
+                          (open-tag? coordinate)
+                          (= name (coordinates->tag-name html coordinate))))]
+    ; index first, filter second, so the returned index always refers
+    ; to a position in `coordinates` itself
+    (->> (map-indexed vector coordinates)
+         (filter candidate?)
+         last
+         first)))
+
+(defn- unify-one
+  "Handles one closing tag spanning `[start end]`: looks up the matching
+  earlier coordinate via `last-by-tag-name-idx` and stretches it so that
+  it ends at `end`. Without a match `coordinates` is returned unchanged."
+  [html coordinates [start end :as closing]]
+  (let [tag-name (coordinates->tag-name html closing)]
+    (if-let [opening-idx (last-by-tag-name-idx html coordinates tag-name start)]
+      (update coordinates opening-idx (fn [[opening-start]] [opening-start end]))
+      coordinates)))
+
+(defn- unify-reducer-fn
+  "Builds the reducing function used by `unify`. A coordinate that starts
+  with `</` is folded into an earlier coordinate through `unify-one`;
+  every other coordinate is appended to the accumulator."
+  [html]
+  (let [closing-tag? (fn [start]
+                       (and (= \< (nth html start))
+                            (= \/ (nth html (inc start) nil))))]
+    (fn [coordinates [start end]]
+      (let [coordinate [start end]]
+        (if (closing-tag? start)
+          (unify-one html coordinates coordinate)
+          (conj coordinates coordinate))))))
+
+(defn unify
+  "Merges the `coordinates` of an opening tag and of its closing tag into
+  a single coordinate covering the whole node in `html`. Without this
+  step, `html` such as:
+
+  ```html
+  <div>hello</div>
+  ```
+
+  would result in 3 nodes (div, text, div) instead of 2 (div, text),
+  since the coordinates composed in a single pass over the raw HTML
+  string know nothing about the context they live in."
+  [html coordinates]
+  (let [reducer-fn (unify-reducer-fn html)]
+    (reduce reducer-fn [] coordinates)))
+
+(defn- children
+  "Returns the `coordinates` that lie strictly inside the `[from to]`
+  span, ordered by their start index."
+  [coordinates [from to]]
+  (let [inside? (fn [[iter-from iter-to]]
+                  (and (> iter-from from)
+                       (< iter-to to)))]
+    (sort-by first (filter inside? coordinates))))
+
+(defn- without-children
+  "Returns `coordinates` without the parent itself (any coordinate that
+  starts at `parent-from`) and without the coordinates that lie strictly
+  inside the `[parent-from parent-to]` span."
+  [coordinates [parent-from parent-to]]
+  (let [nested? (fn [[from to]]
+                  (and (> from parent-from)
+                       (< to parent-to)))]
+    (filter (fn [[from :as coordinate]]
+              (and (not= from parent-from)
+                   (not (nested? coordinate))))
+            coordinates)))
+
+(defn- html-str->node-name
+  "Parses a given HTML string of a node to get its name as
+  a keyword. A text node will return `:dompa/text`.
+
+  The name ends at the first whitespace character, `/` or `>`, so
+  `<br/>` yields `:br` and a line break after the tag name is
+  treated the same as a space."
+  [html]
+  (if (str/starts-with? html "<")
+    (->> (subs html 1)
+         (take-while (complement #{\space \tab \newline \return \formfeed \/ \>}))
+         (apply str)
+         keyword)
+    :dompa/text))
+
+(defn- html-attr-str->k-v
+  "Parses a given HTML node attribute (a string or a sequence of
+  characters) into a key-value pair of strings, split at the first
+  `=` only, so that a value may itself contain `=` (as in
+  `href=\"/x?a=1\"`). An attribute without a value part yields a
+  single-element result holding just the key."
+  [attr]
+  (let [[k-chars [_ & v-chars]] (split-with #(not= % \=) attr)
+        k (apply str k-chars)]
+    (if (seq v-chars)
+      [k (apply str v-chars)]
+      [k])))
+
+(defn- normalize-html-attr-str
+  "Normalizes a given HTML attribute value string. If it starts with a
+  double or a single quote, returns what lies between that quote and
+  the next quote of the same kind; otherwise returns the string as-is."
+  [html-attr-str]
+  (let [quote-char (first html-attr-str)]
+    (if (contains? #{\" \'} quote-char)
+      (->> (rest html-attr-str)
+           (take-while #(not= % quote-char))
+           (apply str))
+      html-attr-str)))
+
+(defn- parse-html-attr-str
+  "Turns a single HTML attribute into a one-entry map from the keyword
+  form of its name to its normalized value. An attribute that has no
+  value part counts as a boolean attribute and maps to `true`."
+  [html-attr-str]
+  (let [[attr-name attr-value] (html-attr-str->k-v html-attr-str)]
+    {(keyword attr-name)
+     (if (some? attr-value)
+       (normalize-html-attr-str attr-value)
+       true)}))
+
+(defn- html-str->node-attrs
+  "Parses the attributes of the opening tag that the node string `html`
+  starts with into a map, using `parse-html-attr-str` for each
+  attribute. Returns `nil` when `html` does not start with `<` (a text
+  node), and an empty map for a tag without attributes.
+
+  Quoted values are kept in one piece, so they may contain spaces and
+  `/` (`class=\"a b\"`, `href=\"/x\"`). An unquoted value still ends at
+  whitespace, a quote or `/`."
+  [html]
+  (when (str/starts-with? html "<")
+    (let [; everything up to, but excluding, the first `>` that is
+          ; not inside a quoted value
+          opening-tag (re-find #"^<(?:\"[^\"]*\"|'[^']*'|[^>\"'])*" html)
+          ; a name, optionally followed by `=` and a quoted or
+          ; unquoted value
+          attr-re #"[^\s=/\"']+(?:=(?:\"[^\"]*\"|'[^']*'|[^\s\"'/]*))?"]
+      (->> (re-seq attr-re (subs opening-tag 1))
+           ; the first token is the tag name
+           (drop 1)
+           (map parse-html-attr-str)
+           (into {})))))
+
+(defn- construct-node
+  "Builds the node map for `node-html`. `:name` is always present;
+  `:value` is added for text nodes, `:attrs` when
+  `html-str->node-attrs` returns a value, and `:children` when
+  `node-children` is given."
+  [node-html node-children]
+  (let [node-name (html-str->node-name node-html)
+        attrs (html-str->node-attrs node-html)]
+    (cond-> {:name node-name}
+      (= node-name :dompa/text) (assoc :value node-html)
+      attrs (assoc :attrs attrs)
+      node-children (assoc :children node-children))))
+
+(defn ->nodes
+  "Turns `html` plus its `coordinates` into a tree of nodes, one node per
+  HTML node, each holding its own children. Returns `nil` when there
+  are no coordinates."
+  [html coordinates]
+  (when (seq coordinates)
+    (let [ordered (sort-by first coordinates)
+          ; the earliest coordinate is the next node on this level
+          [from to :as parent] (first ordered)
+          nested (children ordered parent)
+          siblings (without-children ordered parent)
+          ; coordinates are inclusive, `subs` wants an exclusive end
+          node (construct-node (subs html from (inc to))
+                               (->nodes html nested))]
+      (cons node (->nodes html siblings))))) \ No newline at end of file
diff --git a/src/dompa/core.clj b/src/dompa/core.clj
deleted file mode 100644
index 4891c46..0000000
--- a/src/dompa/core.clj
+++ /dev/null
@@ -1,52 +0,0 @@
-(ns dompa.core
- (:require
- [dompa.coordinates :refer [html->coordinates]]
- [dompa.nodes :refer [coordinates->nodes]]))
-
-(defn html->nodes [html]
- (->> (html->coordinates html)
- (coordinates->nodes html)))
-
-(def default-void-nodes
- #{:img})
-
-(defn- node->html
- [{:keys [name content void-node?]}]
- (if void-node?
- (str "<" name ">")
- (str "<" name ">" content "</" name ">")))
-
-(defn nodes->html
- ([nodes]
- (nodes->html nodes {:void-nodes default-void-nodes}))
- ([nodes {:keys [void-nodes]}]
- (reduce
- (fn [html node]
- (if (= (-> node :name) :dompa/text)
- (str html (-> node :value))
- (node->html {:name (-> node :name name)
- :content (nodes->html (-> node :children))
- :void-node? (contains? void-nodes (-> node :name))})))
- ""
- nodes)))
-
-(defn traverse-nodes [nodes pred]
- (reduce
- (fn [updated-nodes node]
- (if-let [updated-node (pred node)]
- (let [children (traverse-nodes (-> updated-node :children) pred)]
- (conj updated-nodes (assoc updated-node :children children)))
- updated-nodes))
- []
- nodes))
-
-(defn traverse-html [html pred]
- (-> (html->nodes html)
- (traverse-nodes pred)
- nodes->html))
-
-(comment
- (traverse-html "<div>asdasd<span>hello</span></div>" #(when-not (= (-> % :name) :span)
- %))
- (html->coordinates "<div>hello<span>asd</span><strong>asdasdadad<img></strong></div>hello some text<div>another root element</div>")
- (html->nodes "<div>hello<span><img src=\"test.jpg\" ckcche/>asd</span><strong>asdasdadad</strong>")) \ No newline at end of file
diff --git a/src/dompa/html.cljc b/src/dompa/html.cljc
new file mode 100644
index 0000000..6268fa0
--- /dev/null
+++ b/src/dompa/html.cljc
@@ -0,0 +1,17 @@
+(ns dompa.html
+ (:require
+ [dompa.coordinates :as coordinates]))
+
+(defn ->coordinates
+  "Transform a `html` string into a vector of coordinates
+  indicating where each HTML node begins and ends. The string is
+  first composed into single-pass coordinates, which are then
+  unified so that an opening tag and its closing tag share one
+  coordinate."
+  [html]
+  ; `compose` has to be called with `html`; handing the function
+  ; itself to `unify` would make it reduce over a function
+  (->> (coordinates/compose html)
+       (coordinates/unify html)))
+
+(defn ->nodes
+  "Parses a `html` string into a tree of nodes, one node per
+  HTML node, each holding its own children."
+  [html]
+  (let [coords (->coordinates html)]
+    (coordinates/->nodes html coords))) \ No newline at end of file
diff --git a/src/dompa/nodes.clj b/src/dompa/nodes.clj
deleted file mode 100644
index 82734a3..0000000
--- a/src/dompa/nodes.clj
+++ /dev/null
@@ -1,77 +0,0 @@
-(ns dompa.nodes
- (:require
- [clojure.string :as str]
- [dompa.coordinates :as coordinates]))
-
-(defn- html-str->node-name
- "Parses a given HTML string of a node to get its name as
- a keyword. A text node will return `:dompa/text`."
- [html]
- (if (str/starts-with? html "<")
- (->> (subs html 1)
- (take-while #(not (contains? #{\space \>} %)))
- (reduce str)
- keyword)
- :dompa/text))
-
-(defn- html-attr-str->k-v
- "Parses a given HTML node attribute string into a
- key-value pair."
- [attr]
- (->> (partition-by #(= % \=) attr)
- (filter #(not= (-> % first) \=))
- (map #(reduce str %))))
-
-(defn- normalize-html-attr-str
- "Normalizes a given HTML attribute string. If it
- has surrounding quotes, removes them."
- [html-attr-str]
- (if (str/starts-with? html-attr-str "\"")
- (->> (subs html-attr-str 1)
- (take-while #(not= % \"))
- (reduce str))
- html-attr-str))
-
-(defn- parse-html-attr-str
- "Parses a given HTML attribute into a normalized
- key-value map. Attributes with no value part are
- treated as boolean attributes, and are always `true`."
- [html-attr-str]
- (let [[k v] (html-attr-str->k-v html-attr-str)
- k (keyword k)
- v (if (nil? v) true (normalize-html-attr-str v))]
- {k v}))
-
-(defn- html-str->node-attrs [html]
- (when (str/starts-with? html "<")
- (->> (subs html 1)
- (take-while #(not (contains? #{\> \/} %)))
- (partition-by #(= % \space))
- (drop 1)
- (filter #(not= (-> % first) \space))
- (map parse-html-attr-str)
- (into {}))))
-
-(defn- construct-node
- [node-html node-children]
- (let [node-name (html-str->node-name node-html)]
- (merge
- {:name node-name}
- (when (= node-name :dompa/text)
- {:value node-html})
- (when-let [attrs (html-str->node-attrs node-html)]
- {:attrs attrs})
- (when node-children
- {:children node-children}))))
-
-(defn coordinates->nodes
- [html coordinates]
- (when (seq coordinates)
- (let [sorted-coordinates (sort-by first coordinates)
- [parent-from parent-to] (first sorted-coordinates)
- children (coordinates/children sorted-coordinates [parent-from parent-to])
- remaining (coordinates/without-children sorted-coordinates [parent-from parent-to])
- node-html (subs html parent-from (inc parent-to))
- node-children (coordinates->nodes html children)]
- (cons (construct-node node-html node-children)
- (coordinates->nodes html remaining)))))
diff --git a/src/dompa/nodes.cljc b/src/dompa/nodes.cljc
new file mode 100644
index 0000000..d035116
--- /dev/null
+++ b/src/dompa/nodes.cljc
@@ -0,0 +1,59 @@
+(ns dompa.nodes)
+
+(def ^:private default-void-nodes
+  "Node names that `->html` treats as void by default, meaning they
+  are written without children and without a closing tag."
+  #{:!doctype
+    :area :base :br :col :embed :hr :img
+    :input :link :meta :source :track :wbr})
+
+(defn- attrs->html
+  "Serializes an `attrs` map into the text that follows the tag name.
+  A `true` value gives a bare attribute name, `nil` and `false` values
+  are left out, any other value is written as `name=\"value\"`. Values
+  are written as they are, without any escaping."
+  [attrs]
+  (->> attrs
+       (keep (fn [[k v]]
+               (cond
+                 (true? v) (str " " (name k))
+                 (or (nil? v) (false? v)) nil
+                 :else (str " " (name k) "=\"" v "\""))))
+       (apply str)))
+
+(defn- node->html-reducer-fn
+  "Builds the reducing function used by `->html`, which appends the
+  HTML of one `node` to the `html` accumulated so far. `void-nodes` is
+  the set of node names written without children and closing tag;
+  `nodes->html-fn` turns the children of a node into an HTML string."
+  [void-nodes nodes->html-fn]
+  (fn [html node]
+    (let [node-name (:name node)]
+      (if (= node-name :dompa/text)
+        (str html (:value node))
+        ; `name` drops the leading colon that `str` would
+        ; keep when given a keyword
+        (let [tag (name node-name)
+              opening (str "<" tag (attrs->html (:attrs node)) ">")]
+          (if (contains? void-nodes node-name)
+            (str html opening)
+            (str html
+                 opening
+                 (nodes->html-fn (:children node))
+                 "</" tag ">")))))))
+
+(defn traverse
+  "Recursively walks the given tree of `nodes`, calling `traverser-fn`
+  with one node at a time. When the traverser function returns `nil`,
+  the node is removed; otherwise the returned node replaces the
+  original and its children are traversed in turn. To keep a node
+  unchanged, return it as-is. Returns a vector of the resulting nodes;
+  every kept node gets a `:children` vector, empty when it has none."
+  [nodes traverser-fn]
+  (let [visit (fn [node]
+                (when-let [updated-node (traverser-fn node)]
+                  (assoc updated-node
+                         :children
+                         (traverse (:children updated-node) traverser-fn))))]
+    (into [] (keep visit) nodes)))
+
+(defn ->html
+  "Transform a vector of `nodes` into an HTML string.
+
+  Options:
+  - `void-nodes` - A set of node names that are self-closing. It applies
+    to the whole tree, not only to the top-level nodes. Defaults to:
+    - `:!doctype`
+    - `:area`
+    - `:base`
+    - `:br`
+    - `:col`
+    - `:embed`
+    - `:hr`
+    - `:img`
+    - `:input`
+    - `:link`
+    - `:meta`
+    - `:source`
+    - `:track`
+    - `:wbr`
+  "
+  ([nodes]
+   (->html nodes {:void-nodes default-void-nodes}))
+  ([nodes {:keys [void-nodes] :or {void-nodes default-void-nodes}}]
+   (let [options {:void-nodes void-nodes}
+         ; children are rendered with the same options; recursing
+         ; through the 1-arity would reset them to the defaults
+         children->html #(->html % options)
+         reducer-fn (node->html-reducer-fn void-nodes children->html)]
+     (reduce reducer-fn "" nodes)))) \ No newline at end of file
diff --git a/src/dompa/utils.cljc b/src/dompa/utils.cljc
new file mode 100644
index 0000000..c25e841
--- /dev/null
+++ b/src/dompa/utils.cljc
@@ -0,0 +1 @@
+(ns dompa.utils)