summaryrefslogtreecommitdiff
path: root/src/dompa/coordinates.clj
diff options
context:
space:
mode:
authorAsko Nõmm <asko@nmm.ee>2025-08-24 19:03:27 +0300
committerAsko Nõmm <asko@nmm.ee>2025-08-24 19:03:27 +0300
commit41c1d9eadd3a5cb0a804390edca592c197f49d33 (patch)
tree6d760ef1e6f3ee4dc80f97a7de0ccfa7b02feed3 /src/dompa/coordinates.clj
Initial commit
Diffstat (limited to 'src/dompa/coordinates.clj')
-rw-r--r--src/dompa/coordinates.clj88
1 files changed, 88 insertions, 0 deletions
diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj
new file mode 100644
index 0000000..e78dd08
--- /dev/null
+++ b/src/dompa/coordinates.clj
@@ -0,0 +1,88 @@
+(ns dompa.coordinates
+ (:require [clojure.string :as str]))
+
+(defn- construct-coordinates
+ [{:keys [char-type start-idx coordinates] :as state} [idx c]]
+ (cond
+ ; we're undecided what to do next,
+ ; so we figure it out here
+ (nil? char-type)
+ {:char-type (if (some #{c} "<>") :tag :text)
+ :start-idx idx
+ :coordinates coordinates}
+ ; text ended, tag begins, which means we can
+ ; record text node coordinates
+ (and (= :text char-type)
+ (= \< c))
+ {:char-type :tag
+ :start-idx idx
+ :coordinates (conj coordinates [start-idx (dec idx)])}
+
+ ; otherwise don't record anything, just note
+ ; the start of a tag
+ (and (not= :text char-type)
+ (= \< c))
+ {:char-type :tag
+ :start-idx idx
+ :coordinates coordinates}
+
+ ; tag ended, record tag node coordinates
+ (= \> c)
+ {:char-type nil
+ :start-idx idx
+ :coordinates (conj coordinates [start-idx idx])}
+
+ :else state))
+
+(defn coordinates->tag-name [html [from to]]
+ (-> (subs html from to)
+ (str/split #"[\s\>]")
+ first
+ (str/replace #"[\<\>\/]" "")))
+
+(defn- name-coordinates-fn [html]
+ (fn [idx coordinate]
+ [idx (coordinates->tag-name html coordinate)]))
+
+(defn- last-coordinate-by-tag-name-idx [html coordinates name start]
+ (let [filtered-coordinates (filter (fn [[_ end]] (< end start)) coordinates)
+ named-coordinates (map-indexed (name-coordinates-fn html) filtered-coordinates)]
+ (->> named-coordinates
+ (filter #(= name (-> % last)))
+ last
+ first)))
+
+(defn- merge-coordinate [html coordinates [start end]]
+ (let [name (coordinates->tag-name html [start end])
+ matching-idx (last-coordinate-by-tag-name-idx html coordinates name start)
+ [matching-start _] (nth coordinates matching-idx)]
+ (assoc coordinates matching-idx [matching-start end])))
+
+(defn- merge-coordinates-fn [html]
+ (fn [coordinates [start end]]
+ (if (and (= \< (nth html start))
+ (= \/ (nth html (inc start) nil)))
+ (merge-coordinate html coordinates [start end])
+ (conj coordinates [start end]))))
+
+(defn children
+ [coordinates [from to]]
+ (->> coordinates
+ (filter (fn [[iter-from iter-to]]
+ (and (< from iter-from)
+ (> to iter-to))))
+ (sort-by first)))
+
+(defn without-children
+ [coordinates [parent-from parent-to]]
+ (->> coordinates
+ (remove (fn [[from to]]
+ (or (= from parent-from)
+ (and (> from parent-from)
+ (< to parent-to)))))))
+
+(defn html->coordinates [html]
+ (->> (map-indexed vector html)
+ (reduce construct-coordinates {:char-type nil :start-idx 0 :coordinates []})
+ :coordinates
+ (reduce (merge-coordinates-fn html) [])))