diff options
Diffstat (limited to 'src/dompa')
| -rw-r--r-- | src/dompa/coordinates.clj | 20 | ||||
| -rw-r--r-- | src/dompa/core.clj | 30 | ||||
| -rw-r--r-- | src/dompa/nodes.clj | 39 |
3 files changed, 49 insertions, 40 deletions
diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj index e78dd08..1b18472 100644 --- a/src/dompa/coordinates.clj +++ b/src/dompa/coordinates.clj @@ -1,7 +1,7 @@ (ns dompa.coordinates (:require [clojure.string :as str])) -(defn- construct-coordinates +(defn- construct-coordinates-reducer [{:keys [char-type start-idx coordinates] :as state} [idx c]] (cond ; we're undecided what to do next, @@ -10,6 +10,7 @@ {:char-type (if (some #{c} "<>") :tag :text) :start-idx idx :coordinates coordinates} + ; text ended, tag begins, which means we can ; record text node coordinates (and (= :text char-type) @@ -34,6 +35,12 @@ :else state)) +(defn- construct-coordinates + [indexed-html] + (->> indexed-html + (reduce construct-coordinates-reducer {:char-type nil :start-idx 0 :coordinates []}) + :coordinates)) + (defn coordinates->tag-name [html [from to]] (-> (subs html from to) (str/split #"[\s\>]") @@ -58,13 +65,17 @@ [matching-start _] (nth coordinates matching-idx)] (assoc coordinates matching-idx [matching-start end]))) -(defn- merge-coordinates-fn [html] +(defn- merge-coordinates-reducer-fn [html] (fn [coordinates [start end]] (if (and (= \< (nth html start)) (= \/ (nth html (inc start) nil))) (merge-coordinate html coordinates [start end]) (conj coordinates [start end])))) +(defn merge-coordinates [html] + (-> (merge-coordinates-reducer-fn html) + (reduce []))) + (defn children [coordinates [from to]] (->> coordinates @@ -83,6 +94,5 @@ (defn html->coordinates [html] (->> (map-indexed vector html) - (reduce construct-coordinates {:char-type nil :start-idx 0 :coordinates []}) - :coordinates - (reduce (merge-coordinates-fn html) []))) + construct-coordinates + merge-coordinates)) diff --git a/src/dompa/core.clj b/src/dompa/core.clj index d8fff41..79d0c08 100644 --- a/src/dompa/core.clj +++ b/src/dompa/core.clj @@ -4,23 +4,25 @@ [dompa.nodes :refer [coordinates->nodes]])) (defn html->nodes [html] - (->> html - html->coordinates + (->> (html->coordinates html) (coordinates->nodes html))) -(defn nodes->html [nodes] - (reduce - (fn [html node] - (cond - (= (-> node :name) :dompa/text) - (str html (-> node :value)) +(defn nodes->html + ([nodes] + (nodes->html nodes {:void-nodes #{:img}})) + ([nodes {:keys [void-nodes]}] + (reduce + (fn [html node] + (cond + (= (-> node :name) :dompa/text) + (str html (-> node :value)) - :else - (let [node-name (-> node :name name) - node-child-html (nodes->html (-> node :children))] - (str html "<" node-name ">" node-child-html "</" node-name ">")))) - "" - nodes)) + :else + (let [node-name (-> node :name name) + node-child-html (nodes->html (-> node :children))] + (str html "<" node-name ">" node-child-html "</" node-name ">")))) + "" + nodes))) (defn traverse-nodes [nodes pred] (reduce diff --git a/src/dompa/nodes.clj b/src/dompa/nodes.clj index eb96027..72f89ff 100644 --- a/src/dompa/nodes.clj +++ b/src/dompa/nodes.clj @@ -3,7 +3,7 @@ [clojure.string :as str] [dompa.coordinates :as coordinates])) -(defn- html->node-name +(defn- html-str->node-name "Parses a given HTML string of a node to get its name as a keyword. A text node will return `:dompa/text`." [html] @@ -14,7 +14,7 @@ keyword) :dompa/text)) -(defn- attr->k-v +(defn- html-attr-str->k-v "Parses a given HTML node attribute string into a key-value pair." [attr] @@ -22,52 +22,49 @@ (filter #(not= (-> % first) \=)) (map #(reduce str %)))) -(defn- normalize-attr-str +(defn- normalize-html-attr-str "Normalizes a given HTML attribute string. If it has surrounding quotes, removes them." - [attr-str] - (if (str/starts-with? attr-str "\"") - (->> (subs attr-str 1) + [html-attr-str] + (if (str/starts-with? html-attr-str "\"") + (->> (subs html-attr-str 1) (take-while #(not= % \")) (reduce str)) - attr-str)) + html-attr-str)) -(defn- parse-attr +(defn- parse-html-attr-str "Parses a given HTML attribute into a normalized key-value map. Attributes with no value part are treated as boolean attributes, and are always `true`." - [attr] - (let [[k v] (attr->k-v attr) + [html-attr-str] + (let [[k v] (html-attr-str->k-v html-attr-str) k (keyword k) - v (if (nil? v) true (normalize-attr-str v))] + v (if (nil? v) true (normalize-html-attr-str v))] {k v})) -(defn- html->node-attrs [html] +(defn- html-str->node-attrs [html] (when (str/starts-with? html "<") (->> (subs html 1) (take-while #(not (contains? #{\> \/} %))) (partition-by #(= % \space)) (drop 1) (filter #(not= (-> % first) \space)) - (map parse-attr) + (map parse-html-attr-str) (into {})))) -(comment - (html->node-attrs "<img src=\"test.jpg\" checked />")) - (defn- construct-node [node-html node-children] - (let [node-name (html->node-name node-html)] + (let [node-name (html-str->node-name node-html)] (merge {:name node-name} (when (= node-name :dompa/text) {:value node-html}) - (when-let [attrs (html->node-attrs node-html)] + (when-let [attrs (html-str->node-attrs node-html)] {:attrs attrs}) (when node-children {:children node-children})))) -(defn coordinates->nodes +(defn html-coordinates->nodes [html coordinates] (when (seq coordinates) (let [sorted-coordinates (sort-by first coordinates) @@ -75,6 +72,6 @@ children (coordinates/children sorted-coordinates [parent-from parent-to]) remaining (coordinates/without-children sorted-coordinates [parent-from parent-to]) node-html (subs html parent-from (inc parent-to)) - node-children (coordinates->nodes html children)] + node-children (html-coordinates->nodes html children)] (cons (construct-node node-html node-children) - (coordinates->nodes html remaining))))) + (html-coordinates->nodes html remaining))))) |
