summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Asko Nõmm <asko@nmm.ee> 2025-08-24 19:03:27 +0300
committer: Asko Nõmm <asko@nmm.ee> 2025-08-24 19:03:27 +0300
commit: 41c1d9eadd3a5cb0a804390edca592c197f49d33 (patch)
tree: 6d760ef1e6f3ee4dc80f97a7de0ccfa7b02feed3
Initial commit
-rw-r--r-- .gitignore 5
-rw-r--r-- deps.edn 1
-rw-r--r-- src/dompa/coordinates.clj 88
-rw-r--r-- src/dompa/core.clj 13
-rw-r--r-- src/dompa/nodes.clj 30
5 files changed, 137 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..79c77e1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+.nrepl-port
+.clj-kondo/
+dompa.iml
+.cpcache/
diff --git a/deps.edn b/deps.edn
new file mode 100644
index 0000000..024a763
--- /dev/null
+++ b/deps.edn
@@ -0,0 +1 @@
+{:deps {}}
\ No newline at end of file
diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj
new file mode 100644
index 0000000..e78dd08
--- /dev/null
+++ b/src/dompa/coordinates.clj
@@ -0,0 +1,88 @@
+(ns dompa.coordinates
+ (:require [clojure.string :as str]))
+
+(defn- construct-coordinates
+  "Reducing step that scans HTML one character at a time.
+
+  `state` holds `:char-type` (nil, :tag or :text), `:start-idx` (the index
+  where the current run began) and `:coordinates` (the [start end] index
+  pairs recorded so far). The second argument is an [idx c] pair of the
+  character's index and the character itself. Returns the next state."
+  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
+  (let [restart-at-idx (fn [next-type next-coordinates]
+                         {:char-type next-type
+                          :start-idx idx
+                          :coordinates next-coordinates})]
+    (cond
+      ; nothing is in progress yet: decide whether this character
+      ; begins a tag or a run of text
+      (nil? char-type)
+      (restart-at-idx (if (#{\< \>} c) :tag :text) coordinates)
+
+      ; a tag begins; if text was in progress it ended on the
+      ; previous character, so record it first
+      (= \< c)
+      (restart-at-idx :tag (if (= :text char-type)
+                             (conj coordinates [start-idx (dec idx)])
+                             coordinates))
+
+      ; the current run ends here, `>` included
+      (= \> c)
+      (restart-at-idx nil (conj coordinates [start-idx idx]))
+
+      ; any other character just extends the current run
+      :else state)))
+
+(defn coordinates->tag-name
+  "Returns the name of the tag found at the [from to] coordinate of `html`,
+  with the `<`, `>` and `/` characters stripped.
+
+  The slice is taken with `subs`, so the character at `to` itself is not
+  part of it. The name is the first token of the slice when split on
+  whitespace or `>`. Returns an empty string when the slice has no such
+  token (for example when it is only whitespace)."
+  [html [from to]]
+  (let [head (-> (subs html from to)
+                 (str/split #"[\s\>]")
+                 first)]
+    ; str/split drops trailing empty strings, so an all-whitespace slice
+    ; splits to [] and `first` is nil; str/replace would throw on nil
+    (str/replace (or head "") #"[\<\>\/]" "")))
+
+(defn- name-coordinates-fn
+  "Returns a two-argument function, suitable for `map-indexed`, that turns
+  an index and a coordinate into an [index tag-name] pair, with the tag
+  name read from `html` via `coordinates->tag-name`."
+  [html]
+  #(vector %1 (coordinates->tag-name html %2)))
+
+(defn- last-coordinate-by-tag-name-idx
+  "Returns the index in `coordinates` of the last opening tag named `name`
+  that ends before `start` and has not been closed yet, or nil when there
+  is no such tag.
+
+  Only still-open opening tags are candidates. Text runs and elements that
+  were already extended to their closing tag are skipped, so a text node
+  whose first word equals the tag name cannot be matched, and nested tags
+  of the same name pair up innermost-first."
+  [html coordinates name start]
+  (let [open-tag? (fn [[from to]]
+                    (and (= \< (nth html from nil))
+                         (not= \/ (nth html (inc from) nil))
+                         ; a bare tag ends at its first `>`; a coordinate
+                         ; that spans an earlier `>` was already merged
+                         ; with its closing tag
+                         (not (str/includes? (subs html from to) ">"))))
+        ; the index is taken from `coordinates` itself so it is always
+        ; valid for `nth`/`assoc` on that same vector
+        candidate-idx (fn [idx [_ end :as coordinate]]
+                        (when (and (< end start)
+                                   (open-tag? coordinate)
+                                   (= name (coordinates->tag-name html coordinate)))
+                          idx))]
+    (last (keep-indexed candidate-idx coordinates))))
+
+(defn- merge-coordinate
+  "Handles the closing tag located at [start end] in `html`: finds the
+  matching open tag in the `coordinates` vector and extends its end index
+  to `end`, so the entry then spans the whole element.
+
+  Returns `coordinates` unchanged when no open tag matches (a stray closing
+  tag), instead of throwing."
+  [html coordinates [start end]]
+  (let [name (coordinates->tag-name html [start end])
+        matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)]
+    (if (nil? matching-idx)
+      coordinates
+      (let [[matching-start _] (nth coordinates matching-idx)]
+        (assoc coordinates matching-idx [matching-start end])))))
+
+(defn- merge-coordinates-fn
+  "Returns a reducing function over coordinates for the given `html`.
+
+  A coordinate that starts with `</` is handed to `merge-coordinate`;
+  every other coordinate is appended to the accumulator as is."
+  [html]
+  (fn [coordinates [start end]]
+    (let [closing-tag? (and (= \< (nth html start))
+                            (= \/ (nth html (inc start) nil)))]
+      (if-not closing-tag?
+        (conj coordinates [start end])
+        (merge-coordinate html coordinates [start end])))))
+
+(defn children
+  "Returns the coordinates that lie strictly inside the [from to]
+  coordinate (start after `from`, end before `to`), sorted by their start
+  index."
+  [coordinates [from to]]
+  (let [inside? (fn [[iter-from iter-to]]
+                  (and (> iter-from from)
+                       (< iter-to to)))]
+    (sort-by first (filter inside? coordinates))))
+
+(defn without-children
+  "Returns `coordinates` without the parent coordinate itself (any entry
+  starting at `parent-from`) and without every coordinate lying strictly
+  inside [parent-from parent-to]. The order of the rest is preserved."
+  [coordinates [parent-from parent-to]]
+  (let [outside-parent? (fn [[from to]]
+                          (and (not= from parent-from)
+                               (or (<= from parent-from)
+                                   (>= to parent-to))))]
+    (filter outside-parent? coordinates)))
+
+(defn html->coordinates
+  "Scans `html` and returns a vector of [start end] index pairs, one per
+  text run or tag, where an opening tag that has a matching closing tag is
+  extended to cover the whole element.
+
+  Text at the very end of `html` (after the last tag, or when there are no
+  tags at all) is included as a final text coordinate."
+  [html]
+  (let [{:keys [char-type start-idx coordinates]}
+        (reduce construct-coordinates
+                {:char-type nil :start-idx 0 :coordinates []}
+                (map-indexed vector html))
+        ; construct-coordinates only records a text run once it sees the
+        ; `<` that follows it, so a run still open when the input ends
+        ; has to be recorded here
+        coordinates (if (= :text char-type)
+                      (conj coordinates [start-idx (dec (count html))])
+                      coordinates)]
+    (reduce (merge-coordinates-fn html) [] coordinates)))
diff --git a/src/dompa/core.clj b/src/dompa/core.clj
new file mode 100644
index 0000000..b97d5f2
--- /dev/null
+++ b/src/dompa/core.clj
@@ -0,0 +1,13 @@
+(ns dompa.core
+ (:require
+ [dompa.coordinates :refer [html->coordinates]]
+ [dompa.nodes :refer [coordinates->nodes]]))
+
+(defn html->nodes
+  "Parses the `html` string into nodes: computes its coordinates with
+  `html->coordinates` and passes them, together with `html`, to
+  `coordinates->nodes`."
+  [html]
+  (->> (html->coordinates html)
+       (coordinates->nodes html)))
+
+(comment
+ (html->coordinates "<div>hello<span>asd</span><strong>asdasdadad<img></strong></div>hello some text<div>another root element</div>")
+ (html->nodes "<div>hello<span>asd</span><strong>asdasdadad</strong></div>"))
\ No newline at end of file
diff --git a/src/dompa/nodes.clj b/src/dompa/nodes.clj
new file mode 100644
index 0000000..9b30be3
--- /dev/null
+++ b/src/dompa/nodes.clj
@@ -0,0 +1,30 @@
+(ns dompa.nodes
+ (:require [clojure.string :as str]
+ [dompa.coordinates :as coordinates]))
+
+(defn- html->node-name
+  "Returns the node name for a piece of `html` as a keyword.
+
+  When `html` starts with `<`, the name is its first token (split on
+  whitespace or `>`) with `<`, `>` and `/` removed. Anything else is
+  reported as :text-node."
+  [html]
+  (if-not (str/starts-with? html "<")
+    :text-node
+    (let [head (first (str/split html #"[\s\>]"))]
+      (keyword (str/replace head #"[\<\>\/]" "")))))
+
+(defn- html->node-attrs
+  "Placeholder for attribute extraction. It is not implemented: the
+  argument is ignored and nil is returned for every input."
+  [_html]
+  nil)
+
+
+
+(defn coordinates->nodes
+  "Turns `coordinates` (pairs of inclusive [from to] indexes into `html`)
+  into a sequence of node maps, or nil when `coordinates` is empty.
+
+  The coordinate with the smallest start index becomes a node with:
+    :value    - the slice of `html` it covers
+    :name     - result of `html->node-name` on that slice
+    :attrs    - result of `html->node-attrs` on that slice
+    :children - nodes built from the coordinates strictly inside it
+  The nodes built from all remaining coordinates follow it."
+  [html coordinates]
+  (when (seq coordinates)
+    (let [ordered (sort-by first coordinates)
+          [parent-from parent-to] (first ordered)
+          parent [parent-from parent-to]
+          ; `to` is inclusive, hence the inc for subs
+          node-html (subs html parent-from (inc parent-to))
+          node {:value node-html
+                :name (html->node-name node-html)
+                :attrs (html->node-attrs node-html)
+                :children (coordinates->nodes
+                            html
+                            (coordinates/children ordered parent))}
+          siblings (coordinates->nodes
+                     html
+                     (coordinates/without-children ordered parent))]
+      (cons node siblings))))