diff options
Diffstat (limited to 'src/dompa')
| -rw-r--r-- | src/dompa/core.clj | 37 |
| -rw-r--r-- | src/dompa/nodes.clj | 80 |
2 files changed, 99 insertions, 18 deletions
;; src/dompa/core.clj (state after the change b97d5f2..d8fff41)
;;
;; The `ns` form sits above the visible diff hunk and is not reproduced here;
;; the hunk only shows its last line, `[dompa.nodes :refer [coordinates->nodes]]))`.
;; `html->coordinates` is used below, but where it comes from is not visible
;; in the diff.

(def ^:private void-elements
  "Element names that, per the HTML syntax, never have an end tag."
  #{:area :base :br :col :embed :hr :img :input :link :meta :source :track :wbr})

(defn- attrs->html
  "Serialises an attribute map into the text that follows the tag name,
  including the leading space of every attribute.

  * a `true` value is written as a bare boolean attribute (`checked`),
  * `nil` and `false` values are omitted,
  * every other value is written double-quoted.

  Only `\"` is escaped (as `&quot;`). `&` is left alone on purpose: the
  parser keeps attribute values as raw source text, so escaping it here
  would double-escape existing entities."
  [attrs]
  (apply str
         (for [[k v] attrs
               :when (and (some? v) (not (false? v)))]
           (if (true? v)
             (str " " (name k))
             (str " " (name k) "=\"" (.replace (str v) "\"" "&quot;") "\"")))))

(defn html->nodes
  "Turns an HTML string into a sequence of node maps by computing the
  coordinates of `html` and converting those coordinates into nodes."
  [html]
  (->> html
       html->coordinates
       (coordinates->nodes html)))

(defn nodes->html
  "Serialises a sequence of node maps back into an HTML string.

  * `:dompa/text` nodes contribute their `:value` verbatim.
  * Every other node becomes `<name attrs>children</name>`; its `:attrs`
    map is written out (previously attributes were silently dropped).
  * A void element (`img`, `br`, ...) without children is written as a
    start tag only, because an end tag is invalid for those elements.

  Returns \"\" for an empty or nil `nodes`."
  [nodes]
  (->> nodes
       (map (fn [{node-name :name :keys [value attrs children]}]
              (if (= node-name :dompa/text)
                (str value)
                (let [tag      (name node-name)
                      open-tag (str "<" tag (attrs->html attrs) ">")]
                  ;; Keep the end tag when a void-named node unexpectedly
                  ;; carries children, so no content is ever lost.
                  (if (and (contains? void-elements node-name)
                           (empty? children))
                    open-tag
                    (str open-tag (nodes->html children) "</" tag ">"))))))
       (apply str)))

(defn traverse-nodes
  "Walks `nodes` depth-first and rebuilds the tree from what `pred` returns.

  `pred` is called with each node. A truthy return value replaces the node;
  a falsy return value removes the node together with its whole subtree.
  The children that get traversed are those of the *returned* node.

  Nodes that have no `:children` key are kept as returned by `pred`; they
  no longer gain an empty `:children []` entry. Returns a vector."
  [nodes pred]
  (reduce
   (fn [kept node]
     (if-let [updated (pred node)]
       (conj kept
             (if-some [children (:children updated)]
               (assoc updated :children (traverse-nodes children pred))
               updated))
       kept))
   []
   nodes))

(defn traverse-html
  "Parses `html` into nodes, runs `traverse-nodes` over them with `pred`
  and serialises the resulting tree back into an HTML string."
  [html pred]
  (-> (html->nodes html)
      (traverse-nodes pred)
      nodes->html))

(comment
  (traverse-html "<div>asdasd<span>hello</span></div>"
                 #(when-not (= (-> % :name) :span)
                    %))
  (html->coordinates "<div>hello<span>asd</span><strong>asdasdadad<img></strong></div>hello some text<div>another root element</div>")
  (html->nodes "<div>hello<span><img src=\"test.jpg\" ckcche/>asd</span><strong>asdasdadad</strong>"))
;; src/dompa/nodes.clj (state after the change 9b30be3..eb96027)

(ns dompa.nodes
  (:require
   [clojure.string :as str]
   [dompa.coordinates :as coordinates]))

(def ^:private opening-tag-attrs-re
  "Matches the opening tag at the start of a node's HTML and captures
  everything between the tag name and the closing `>`. Quoted runs are
  consumed as a unit, so a `>` or `/` inside an attribute value does not
  end the tag."
  #"^</?[^\s/>]*((?:\"[^\"]*\"|'[^']*'|[^>\"']++)*)")

(def ^:private attr-re
  "Matches one attribute: a name, optionally followed by `=` and a
  double-quoted, single-quoted or unquoted value."
  #"[^\s\"'>/=]+(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\s\"'>]+))?")

(defn- html->node-name
  "Parses a given HTML string of a node to get its name as
  a keyword. A text node will return `:dompa/text`.

  The name ends at the first whitespace character, `/` or `>`, so
  `<br/>` yields `:br` and a tag name followed by a newline or tab is
  not polluted with that character."
  [html]
  (if (str/starts-with? html "<")
    (keyword (second (re-find #"^</?([^\s/>]*)" html)))
    :dompa/text))

(defn- attr->k-v
  "Parses a given HTML node attribute string into a key-value pair.

  Splits on the first `=` only, so values that themselves contain `=`
  (for example query strings) stay intact. Returns `[k]` for an attribute
  without a value part and `[k v]` otherwise."
  [attr]
  (str/split attr #"\s*=\s*" 2))

(defn- normalize-attr-str
  "Normalizes a given HTML attribute string. If it starts with a double
  or single quote, returns the text between that quote and the next
  matching one (or the end of the string when it is never closed).
  Unquoted values are returned unchanged."
  [attr-str]
  (let [quote-char (first attr-str)]
    (if (contains? #{\" \'} quote-char)
      (let [closing (str/index-of attr-str (str quote-char) 1)]
        (subs attr-str 1 (or closing (count attr-str))))
      attr-str)))

(defn- parse-attr
  "Parses a given HTML attribute into a normalized
  key-value map. Attributes with no value part are
  treated as boolean attributes, and are always `true`."
  [attr]
  (let [[k v] (attr->k-v attr)]
    {(keyword k) (if (nil? v) true (normalize-attr-str v))}))

(defn- html->node-attrs
  "Parses the attributes of the opening tag of `html` into a map of
  keyword -> value. Returns nil for text (anything not starting with `<`)
  and an empty map for an element without attributes.

  Attributes are tokenised with quote awareness, so values containing
  spaces, `/`, `>` or `=` are kept whole."
  [html]
  (when (str/starts-with? html "<")
    (->> (re-find opening-tag-attrs-re html)
         second
         (re-seq attr-re)
         (map parse-attr)
         (into {}))))

(comment
  (html->node-attrs "<img src=\"test.jpg\" checked />"))

(defn- construct-node
  "Builds the node map for `node-html`.

  `:name` is always present. `:value` is only set for text nodes,
  `:attrs` only when `html->node-attrs` returns a map (i.e. for elements),
  and `:children` only when `node-children` is truthy."
  [node-html node-children]
  (let [node-name (html->node-name node-html)
        attrs     (html->node-attrs node-html)]
    (cond-> {:name node-name}
      (= node-name :dompa/text) (assoc :value node-html)
      attrs                     (assoc :attrs attrs)
      node-children             (assoc :children node-children))))

;; `coordinates->nodes` is only partially visible in this diff: the lines
;; between the two hunks are not shown, so the function is deliberately left
;; untouched rather than reconstructed by guesswork. Its visible lines (after
;; the change) are reproduced verbatim below.
;;
;;   (defn coordinates->nodes [html coordinates]
;;     ;; ... lines not shown in the diff ...
;;     [parent-from parent-to] (first sorted-coordinates)
;;     children (coordinates/children sorted-coordinates [parent-from parent-to])
;;     remaining (coordinates/without-children sorted-coordinates [parent-from parent-to])
;;     node-html (subs html parent-from (inc parent-to))
;;     node-children (coordinates->nodes html children)]
;;     (cons (construct-node node-html node-children)
;;     (coordinates->nodes html remaining)))))
