diff options
| author | Asko Nõmm <asko@nmm.ee> | 2025-08-24 19:03:27 +0300 |
|---|---|---|
| committer | Asko Nõmm <asko@nmm.ee> | 2025-08-24 19:03:27 +0300 |
| commit | 41c1d9eadd3a5cb0a804390edca592c197f49d33 (patch) | |
| tree | 6d760ef1e6f3ee4dc80f97a7de0ccfa7b02feed3 /src/dompa/coordinates.clj | |
Initial commit
Diffstat (limited to 'src/dompa/coordinates.clj')
| -rw-r--r-- | src/dompa/coordinates.clj | 88 |
1 files changed, 88 insertions, 0 deletions
;; diff --git a/src/dompa/coordinates.clj b/src/dompa/coordinates.clj
;; new file, index 0000000..e78dd08
(ns dompa.coordinates
  (:require [clojure.string :as str]))

(defn- construct-coordinates
  "Reducing step used while scanning `html` one character at a time.

  `state` is a map with:
    :char-type   - nil (undecided), :text or :tag
    :start-idx   - index where the pending range started
    :coordinates - vector of `[start end]` ranges found so far (end inclusive)

  The second argument is an `[idx c]` pair (index and character).
  Returns the next state."
  [{:keys [char-type start-idx coordinates] :as state} [idx c]]
  (cond
    ; undecided: the current character decides whether a tag
    ; or a text range begins here
    (nil? char-type)
    {:char-type   (if (some #{c} "<>") :tag :text)
     :start-idx   idx
     :coordinates coordinates}

    ; text ended and a tag begins, so the text range
    ; [start-idx, idx - 1] can be recorded
    (and (= :text char-type)
         (= \< c))
    {:char-type   :tag
     :start-idx   idx
     :coordinates (conj coordinates [start-idx (dec idx)])}

    ; a `<` while not in text: nothing to record, just (re)start the tag here
    (and (not= :text char-type)
         (= \< c))
    {:char-type   :tag
     :start-idx   idx
     :coordinates coordinates}

    ; any `>` closes the pending range (inclusive of the `>` itself)
    (= \> c)
    {:char-type   nil
     :start-idx   idx
     :coordinates (conj coordinates [start-idx idx])}

    :else state))

(defn coordinates->tag-name
  "Returns the tag name found in `html` for the range `[from to]`.

  The slice `(subs html from to)` is cut at the first whitespace or `>`,
  then `<`, `>` and `/` are stripped, so both `<div class=\"x\">` and
  `</div>` yield \"div\". Returns \"\" when the slice holds no name."
  [html [from to]]
  (-> (subs html from to)
      (str/split #"[\s\>]")
      first
      ; `str/split` returns an empty vector when the slice consists only of
      ; separators; fall back to \"\" instead of passing nil to `str/replace`
      (or "")
      (str/replace #"[\<\>\/]" "")))

(defn- unclosed-opening-tag?
  "True when `[start end]` is an opening tag that has not yet been merged
  with a closing tag: it starts with `<`, is not itself a closing tag, and
  the first `>` at or after `start` sits exactly at `end`. A merged element
  contains its opening tag's `>` before `end`; a text range does not start
  with `<`."
  [html [start end]]
  (and (= \< (nth html start nil))
       (not= \/ (nth html (inc start) nil))
       (= end (str/index-of html ">" start))))

(defn- last-coordinate-by-tag-name-idx
  "Returns the index in `coordinates` of the last unclosed opening tag named
  `name` that ends before `start`, or nil when there is none.

  The search walks `coordinates` backwards and indexes it directly, so the
  returned index is always valid for `coordinates` itself."
  [html coordinates name start]
  (some (fn [idx]
          (let [[_ end :as coordinate] (nth coordinates idx)]
            (when (and (< end start)
                       (unclosed-opening-tag? html coordinate)
                       (= name (coordinates->tag-name html coordinate)))
              idx)))
        (range (dec (count coordinates)) -1 -1)))

(defn- merge-coordinate
  "Extends the matching opening tag in `coordinates` so that it ends where the
  closing tag `[start end]` ends. A closing tag with no matching opening tag
  is kept as a standalone coordinate instead of throwing."
  [html coordinates [start end]]
  (let [tag-name     (coordinates->tag-name html [start end])
        matching-idx (last-coordinate-by-tag-name-idx html coordinates tag-name start)]
    (if (nil? matching-idx)
      (conj coordinates [start end])
      (let [[matching-start _] (nth coordinates matching-idx)]
        (assoc coordinates matching-idx [matching-start end])))))

(defn- merge-coordinates-fn
  "Returns a reducing function over `[start end]` ranges of `html`: ranges
  starting with `</` are merged into their opening tag, everything else is
  appended unchanged."
  [html]
  (fn [coordinates [start end]]
    (if (and (= \< (nth html start))
             (= \/ (nth html (inc start) nil)))
      (merge-coordinate html coordinates [start end])
      (conj coordinates [start end]))))

(defn children
  "Returns the coordinates lying strictly inside `[from to]`, sorted by
  their start index."
  [coordinates [from to]]
  (->> coordinates
       (filter (fn [[iter-from iter-to]]
                 (and (< from iter-from)
                      (> to iter-to))))
       (sort-by first)))

(defn without-children
  "Returns `coordinates` without the parent `[parent-from parent-to]` itself
  (matched by start index) and without any coordinate lying strictly inside
  it."
  [coordinates [parent-from parent-to]]
  (->> coordinates
       (remove (fn [[from to]]
                 (or (= from parent-from)
                     (and (> from parent-from)
                          (< to parent-to)))))))

(defn html->coordinates
  "Scans `html` and returns a vector of `[start end]` ranges (end inclusive):
  one per element, spanning from its opening tag to its closing tag when a
  closing tag is present, and one per text run. Text that runs to the end of
  the input is included. Returns [] for empty or nil input."
  [html]
  (let [{:keys [char-type start-idx coordinates]}
        (reduce construct-coordinates
                {:char-type nil :start-idx 0 :coordinates []}
                (map-indexed vector html))
        ; the scan only records text when a `<` follows it, so text that
        ; runs to the end of the input has to be flushed here
        scanned (if (= :text char-type)
                  (conj coordinates [start-idx (dec (count html))])
                  coordinates)]
    (reduce (merge-coordinates-fn html) [] scanned)))
