diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/dompa/coordinates.clj | 88 | ||||
| -rw-r--r-- | src/dompa/core.clj | 13 | ||||
| -rw-r--r-- | src/dompa/nodes.clj | 30 |
3 files changed, 131 insertions, 0 deletions
;; src/dompa/coordinates.clj (new file, mode 100644)
(ns dompa.coordinates
  (:require [clojure.string :as str]))

;; A "coordinate" is a pair [from to] of INCLUSIVE character indices into the
;; HTML string. Tokenizing yields one coordinate per tag and per text run;
;; merging then stretches every opening tag's coordinate so that it ends at
;; the `>` of its matching closing tag.

(defn- construct-coordinates
  "Reducing step of the tokenizer.

  `state` holds:
    :char-type   - nil (between tokens), :tag or :text
    :start-idx   - index at which the current token started
    :coordinates - vector of [from to] pairs recorded so far

  `[idx c]` is the index and the character being consumed. Returns the next
  state."
  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
  (cond
    ;; between tokens: this character decides what the next token is.
    ;; Only `<` can open a tag; a stray `>` is ordinary text.
    (nil? char-type)
    {:char-type   (if (= \< c) :tag :text)
     :start-idx   idx
     :coordinates coordinates}

    ;; text ended because a tag begins: record the text run
    (and (= :text char-type) (= \< c))
    {:char-type   :tag
     :start-idx   idx
     :coordinates (conj coordinates [start-idx (dec idx)])}

    ;; `<` while already inside a tag: nothing to record, just restart the
    ;; tag at this position
    (= \< c)
    {:char-type   :tag
     :start-idx   idx
     :coordinates coordinates}

    ;; a tag ended: record it. `>` inside a text run is deliberately NOT
    ;; handled here, so text such as `1 > 0` stays in one piece.
    (and (= :tag char-type) (= \> c))
    {:char-type   nil
     :start-idx   idx
     :coordinates (conj coordinates [start-idx idx])}

    :else state))

(defn coordinates->tag-name
  "Returns the tag name found in `html` at coordinate `[from to]`, e.g. `div`
  for both an opening and a closing div tag.

  The substring is taken with `to` as an exclusive bound, so the final
  character (the `>` of a tag) is left out; the name is the first
  whitespace/`>`-delimited token with every `<`, `>` and `/` removed.
  Returns an empty string when there is no such token (for instance a
  whitespace-only slice) instead of throwing."
  [html [from to]]
  (-> (subs html from to)
      (str/split #"[\s\>]")
      first
      ;; str/split yields [] for a delimiter-only string, making `first` nil
      (or "")
      (str/replace #"[\<\>\/]" "")))

(defn- last-open-position
  "Position within `open` (a vector of [coordinate-idx tag-name] entries,
  outermost first) of the innermost entry named `name`, or nil when no such
  tag is open."
  [open name]
  (->> (map-indexed vector open)
       (filter (fn [[_ [_ open-name]]] (= name open-name)))
       last
       first))

(defn- merge-coordinate
  "Handles the closing tag at `[start end]`: stretches the coordinate of the
  innermost still-open tag with the same name so that it ends at `end`, and
  forgets that tag together with every tag opened after it (those were never
  closed, e.g. `<img>`, and keep their own short coordinate).

  A closing tag with no matching open tag is ignored."
  [html {:keys [coordinates open] :as state} [start end]]
  (let [name (coordinates->tag-name html [start end])
        pos  (last-open-position open name)]
    (if (nil? pos)
      state
      (let [[coordinate-idx _] (nth open pos)]
        {:coordinates (update coordinates coordinate-idx assoc 1 end)
         :open        (subvec open 0 pos)}))))

(defn- merge-coordinates-fn
  "Returns the reducing function used to pair opening and closing tags.

  The accumulator is a map of:
    :coordinates - the merged coordinates built so far
    :open        - stack of [coordinate-idx tag-name] for tags not yet closed"
  [html]
  (fn [{:keys [coordinates open] :as state} [start _ :as coordinate]]
    (cond
      ;; text run: recorded as is, never a candidate for tag matching
      (not= \< (nth html start))
      (update state :coordinates conj coordinate)

      ;; closing tag
      (= \/ (nth html (inc start) nil))
      (merge-coordinate html state coordinate)

      ;; opening (or void) tag: record it and remember it as open
      :else
      {:coordinates (conj coordinates coordinate)
       :open        (conj open [(count coordinates)
                                (coordinates->tag-name html coordinate)])})))

(defn children
  "Returns the coordinates lying strictly inside `[from to]`, ordered by
  their start index."
  [coordinates [from to]]
  (->> coordinates
       (filter (fn [[iter-from iter-to]]
                 (and (< from iter-from)
                      (> to iter-to))))
       (sort-by first)))

(defn without-children
  "Returns `coordinates` without the parent itself (any coordinate starting
  at `parent-from`) and without everything lying strictly inside the parent."
  [coordinates [parent-from parent-to]]
  (->> coordinates
       (remove (fn [[from to]]
                 (or (= from parent-from)
                     (and (> from parent-from)
                          (< to parent-to)))))))

(defn html->coordinates
  "Turns an HTML string into a vector of inclusive [from to] coordinates:
  one per element (spanning opening through closing tag), one per unclosed
  tag and one per text run.

  Text following the last tag is included. An unterminated tag at the very
  end of the input (a `<` with no `>`) is not recorded."
  [html]
  (let [{:keys [char-type start-idx coordinates]}
        (reduce construct-coordinates
                {:char-type nil :start-idx 0 :coordinates []}
                (map-indexed vector html))
        ;; the tokenizer only records a text run when a tag follows it, so a
        ;; run that reaches the end of the input has to be flushed here
        tokens (if (= :text char-type)
                 (conj coordinates [start-idx (dec (count html))])
                 coordinates)]
    (:coordinates
     (reduce (merge-coordinates-fn html)
             {:coordinates [] :open []}
             tokens))))

;; src/dompa/core.clj (new file, mode 100644)
(ns dompa.core
  (:require
   [dompa.coordinates :refer [html->coordinates]]
   [dompa.nodes :refer [coordinates->nodes]]))

(defn html->nodes
  "Parses `html` into a sequence of node maps by first computing its
  coordinates and then building nodes from them."
  [html]
  (let [coordinates (html->coordinates html)]
    (coordinates->nodes html coordinates)))

(comment
  (html->coordinates "<div>hello<span>asd</span><strong>asdasdadad<img></strong></div>hello some text<div>another root element</div>")
  (html->nodes "<div>hello<span>asd</span><strong>asdasdadad</strong></div>"))
;; src/dompa/nodes.clj (new file, mode 100644)
(ns dompa.nodes
  (:require [clojure.string :as str]
            [dompa.coordinates :as coordinates]))

(defn- html->node-name
  "Returns the node name for a node's raw `html` as a keyword: the tag name
  when the html starts with `<`, otherwise :text-node."
  [html]
  (if-not (str/starts-with? html "<")
    :text-node
    (let [head (first (str/split html #"[\s\>]"))]
      (keyword (str/replace head #"[\<\>\/]" "")))))

(defn- html->node-attrs
  "Placeholder for attribute parsing; it has no body yet and therefore
  always returns nil."
  [html])

(defn coordinates->nodes
  "Builds a sequence of node maps from `coordinates` (pairs of inclusive
  [from to] indices into `html`). Returns nil when `coordinates` is empty.

  Each node map has:
    :value    - the slice of `html` covered by the coordinate
    :name     - keyword from `html->node-name`
    :attrs    - result of `html->node-attrs`
    :children - nodes built from the coordinates inside this one"
  [html coordinates]
  (when (seq coordinates)
    (let [ordered     (sort-by first coordinates)
          ;; the coordinate that starts first becomes the next node
          parent      (first ordered)
          [parent-from parent-to] parent
          ;; `parent-to` is inclusive, `subs` wants an exclusive end
          node-html   (subs html parent-from (inc parent-to))
          descendants (coordinates/children ordered parent)
          siblings    (coordinates/without-children ordered parent)
          node        {:value    node-html
                       :name     (html->node-name node-html)
                       :attrs    (html->node-attrs node-html)
                       :children (coordinates->nodes html descendants)}]
      (cons node (coordinates->nodes html siblings)))))
