Files
kewt/awk/markdown_embed.awk
2026-03-07 12:28:30 +01:00

539 lines
15 KiB
Awk

# Report whether src is an absolute http:// or https:// URL rather than a
# local path.
# URI schemes are case-insensitive, so the scheme is compared in lower case;
# otherwise a link written as "HTTPS://..." would be treated as a local file.
# Returns 1 for a remote URL, 0 otherwise.
function is_global_url(src) {
    return (tolower(src) ~ /^https?:\/\//)
}
# Return src with any query string and/or fragment removed, i.e. everything
# before the first "?" or "#", whichever occurs earlier. A source without
# either character is returned unchanged.
# Parameters after src are awk-style local variables.
function split_src(src, base, qpos, hpos, cutpos) {
    qpos = index(src, "?")
    hpos = index(src, "#")
    # Pick the smaller of the two positions that are actually present.
    if (qpos == 0) cutpos = hpos
    else if (hpos == 0) cutpos = qpos
    else cutpos = (hpos < qpos) ? hpos : qpos
    if (cutpos == 0) return src
    base = substr(src, 1, cutpos - 1)
    return base
}
# Return the lower-cased text after the last "." of src, ignoring any query
# string or fragment. Returns "" when the stripped source contains no ".".
# Parameters after src are awk-style local variables.
function ext_of(src, base, n, parts) {
    n = split(split_src(src), parts, ".")
    return (n >= 2) ? tolower(parts[n]) : ""
}
# Return 1 when ext (already lower-cased, without the dot) is one of the
# recognised image extensions, 0 otherwise.
function is_image_ext(ext) {
    if (ext ~ /^(png|jpe?g|gif|bmp|webp|svg|ico)$/) return 1
    return 0
}
# Return 1 when ext (already lower-cased, without the dot) is one of the
# recognised audio extensions, 0 otherwise.
function is_audio_ext(ext) {
    if (ext ~ /^(mp3|wav|ogg|m4a|aac|flac)$/) return 1
    return 0
}
# Return 1 when ext (already lower-cased, without the dot) is one of the
# recognised video extensions, 0 otherwise.
function is_video_ext(ext) {
    if (ext ~ /^(mp4|webm|ogv|mov|m4v)$/) return 1
    return 0
}
# Return 1 when ext (already lower-cased, without the dot) names a text-like
# file type that may be read and inlined into the output, 0 otherwise.
function is_inline_text_ext(ext) {
    if (ext ~ /^(html|txt|md|css|js|mjs|cjs|json|xml|yml|yaml|toml|ini|conf|c|h|cpp|hpp|rs|go|py|sh|lua|php|java|kt|swift|sql|csv|tsv|log)$/) return 1
    return 0
}
# Return path with its last "/"-separated component removed.
# A path containing no "/" yields "."; a path whose only "/" is the leading
# one (e.g. "/file") yields "".
# p is an awk-style local variable.
function dirname_of(path, p) {
    if (path !~ /\/[^\/]*$/) return "."
    p = path
    sub(/\/[^\/]*$/, "", p)
    return p
}
# Return 1 when path can be opened for reading, 0 otherwise.
# getline returns -1 when the file cannot be opened and >= 0 (0 for an empty
# file) when it can, so ">= 0" is an existence/readability probe. The file is
# closed again right away so a later read starts from the top.
# An empty path is rejected up front: gawk treats a redirection whose file
# name is the empty string as a fatal error.
# probe and rc are awk-style local variables.
function path_is_readable(path, probe, rc) {
    if (path == "") return 0
    rc = (getline probe < path)
    close(path)
    return (rc >= 0)
}

# Map an embed source to a readable file on disk, or return "" when none of
# the candidate locations can be opened. Any "?query" / "#fragment" suffix is
# ignored.
#
# Sources starting with "/" are tried, in order, as:
#   1. site_root "/" <path>   (only when site_root is non-empty)
#   2. <path> relative to the current working directory
#   3. fallback_file          (only for "/styles.css", when fallback_file is set)
# All other sources are tried as:
#   1. input_dir "/" <path>
#   2. <path> relative to the current working directory
#
# Reads the globals site_root, fallback_file and input_dir. input_dir is set
# in the BEGIN block; site_root and fallback_file are not assigned in the
# visible source and are presumably supplied with -v (TODO confirm).
# A source that reduces to an empty path ("", "/", "?x", "#frag") resolves to
# "" instead of reaching a getline on an empty file name.
# rel and candidate are awk-style local variables.
function resolve_local_path(src, rel, candidate) {
    rel = split_src(src)
    if (substr(rel, 1, 1) == "/") {
        rel = substr(rel, 2)
        if (rel == "") return ""
        if (site_root != "") {
            candidate = site_root "/" rel
            if (path_is_readable(candidate)) return candidate
        }
        if (path_is_readable(rel)) return rel
        if (rel == "styles.css" && fallback_file != "") {
            if (path_is_readable(fallback_file)) return fallback_file
        }
        return ""
    }
    if (rel == "") return ""
    candidate = input_dir "/" rel
    if (path_is_readable(candidate)) return candidate
    if (path_is_readable(rel)) return rel
    return ""
}
# Read the file at path and return its contents, with every line terminated
# by "\n" (so a final line lacking a newline gains one). Returns "" when the
# file is empty or cannot be read. The file is closed before returning.
# out, line and rc are awk-style local variables.
function read_file(path, out, line, rc) {
    out = ""
    while (1) {
        rc = (getline line < path)
        # 0 means end of file, a negative value means a read/open error.
        if (rc <= 0) break
        out = out line "\n"
    }
    close(path)
    return out
}
# Return s with "&", "<" and ">" replaced by their HTML entities.
# Quotes are left untouched. "&" is handled first so the entities produced
# for "<" and ">" are not escaped a second time.
# t is an awk-style local variable (unused here: awk passes scalars by value,
# so s can be edited in place without affecting the caller).
function escape_html(s, t) {
    # "\\&" in the replacement produces a literal "&" rather than the match.
    gsub(/&/, "\\&amp;", s)
    gsub(/</, "\\&lt;", s)
    gsub(/>/, "\\&gt;", s)
    return s
}
# Wrap the parts of a single CSS source line in <span class="tok-..."> markup.
# Recognised line shapes, tested in this order:
#   - a line consisting only of one /* ... */ comment  -> tok-com
#   - "<selector> {"                                    -> tok-sel + tok-punc
#   - a lone "}"                                        -> tok-punc
#   - "<property>: <value>;"                            -> tok-prop + tok-val,
#     with var(--name) references inside the value wrapped in tok-var
# Any other line is returned unchanged.
# Leading indentation is kept on "}" lines; the output ends up inside <pre>,
# where dropping it would flatten nested rules such as @media blocks.
# Requires gawk: match() is called with an array argument.
# m, prop, val and indent are awk-style local variables.
function css_highlight_line(line, m, prop, val, indent) {
    if (line ~ /^[[:space:]]*\/\*.*\*\/[[:space:]]*$/) {
        return "<span class=\"tok-com\">" line "</span>"
    }
    if (line ~ /^[[:space:]]*[^{}][^{}]*\{[[:space:]]*$/) {
        # Split the trailing "{" off so it gets its own punctuation span.
        sub(/\{[[:space:]]*$/, "", line)
        return "<span class=\"tok-sel\">" line "</span><span class=\"tok-punc\">{</span>"
    }
    if (line ~ /^[[:space:]]*\}[[:space:]]*$/) {
        # Everything before the brace is whitespace: keep it as indentation.
        indent = line
        sub(/\}[[:space:]]*$/, "", indent)
        return indent "<span class=\"tok-punc\">}</span>"
    }
    # NOTE(review): "--?" requires the property name to start with at least
    # one "-", so only custom properties ("--x") and vendor-prefixed names
    # ("-webkit-x") are highlighted; plain "color: red;" falls through.
    # Confirm whether that is intended.
    if (match(line, /^([[:space:]]*)(--?[A-Za-z0-9_-]+)([[:space:]]*:[[:space:]]*)([^;]*)(;?[[:space:]]*)$/, m)) {
        prop = "<span class=\"tok-prop\">" m[2] "</span>"
        # A bare "&" in the replacement stands for the matched var(--name).
        gsub(/var\(--[A-Za-z0-9_-]+\)/, "<span class=\"tok-var\">&</span>", m[4])
        val = "<span class=\"tok-val\">" m[4] "</span>"
        return m[1] prop m[3] val m[5]
    }
    return line
}
# Highlight one line of a <pre><code> block and return the marked-up line.
# Currently a thin wrapper: every code-block line is passed to the CSS
# highlighter, whatever language the block holds.
# NOTE(review): no language detection is visible here - confirm that applying
# CSS highlighting to all code blocks is intended.
function highlight_code_block_line(line) {
return css_highlight_line(line)
}
# Apply css_highlight_line() to every "\n"-separated line of text and return
# the lines joined back together with "\n". Empty input yields "".
# n, i, lines and out are awk-style local variables.
function highlight_css_block(text, n, i, lines, out) {
    n = split(text, lines, "\n")
    if (n == 0) return ""
    out = css_highlight_line(lines[1])
    for (i = 2; i <= n; i++) {
        out = out "\n" css_highlight_line(lines[i])
    }
    return out
}
# Return the HTML-escaped contents of a local file for inclusion inside a
# code block, or "" when nothing should be included.
#   src          - include target; remote http(s) URLs are never included
#   force_inline - when true, the extension whitelist check is skipped
# "" is returned when src is remote, when the extension is not an inline text
# type (and force_inline is false), or when the file cannot be resolved.
# One trailing newline is dropped from the content; files with a "css"
# extension are additionally run through the CSS highlighter.
# ext, local_path and content are awk-style local variables.
function render_code_include(src, force_inline, ext, local_path, content) {
    if (is_global_url(src)) return ""
    ext = ext_of(src)
    if (!(force_inline || is_inline_text_ext(ext))) return ""
    local_path = resolve_local_path(src)
    if (local_path == "") return ""
    content = read_file(local_path)
    sub(/\n$/, "", content)
    # Escape first: the highlighter then works on the escaped text and the
    # <span> markup it adds is not escaped.
    content = escape_html(content)
    return (ext == "css") ? highlight_css_block(content) : content
}
# Produce the replacement markup for an embed.
#   src          - embed target (remote URL or local path)
#   alt          - alternative text, used for images only
#   has_alt      - true when an alt attribute should be emitted
#   force_inline - true to paste a local file's raw contents whatever its type
# Resolution order:
#   1. force_inline with a local, resolvable src -> raw file contents
#   2. image / audio / video extension -> <img> / <audio> / <video> tag
#   3. local src with an inline-text extension that resolves -> raw contents
#   4. anything else -> <iframe>
# File contents lose one trailing newline and are not HTML-escaped.
# NOTE(review): src and alt are placed into attribute values without any
# escaping here; presumably an earlier stage has already encoded them - confirm.
# ext, local_path and content are awk-style local variables.
function render_embed(src, alt, has_alt, force_inline, ext, local_path, content) {
    if (force_inline && !is_global_url(src)) {
        local_path = resolve_local_path(src)
        if (local_path != "") {
            content = read_file(local_path)
            sub(/\n$/, "", content)
            return content
        }
    }
    ext = ext_of(src)
    # Media tags are identical for remote and local sources.
    if (is_image_ext(ext)) {
        if (has_alt) return "<img alt=\"" alt "\" src=\"" src "\" />"
        return "<img src=\"" src "\" />"
    }
    if (is_audio_ext(ext)) return "<audio controls src=\"" src "\"></audio>"
    if (is_video_ext(ext)) return "<video controls src=\"" src "\"></video>"
    # Only local text-like files are inlined; remote ones go to the iframe.
    if (!is_global_url(src) && is_inline_text_ext(ext)) {
        local_path = resolve_local_path(src)
        if (local_path != "") {
            content = read_file(local_path)
            sub(/\n$/, "", content)
            return content
        }
    }
    return "<iframe src=\"" src "\"></iframe>"
}
# Return the value of the double-quoted attribute attr in the HTML tag text
# tag, or "" when the attribute is absent (or present with an empty value).
# Only the attr="value" form is recognised.
# The attribute name must not be preceded by another attribute-name
# character, so asking for "src" does not pick up data-src="..." (and "alt"
# does not pick up data-alt="...").
# attr is spliced into a regular expression; callers pass plain names such as
# "src" or "data-force-inline" that contain no regex metacharacters.
# Side effect: overwrites RSTART/RLENGTH through match().
# pat, m, token and eq are awk-style local variables (m is unused).
function extract_attr(tag, attr, pat, m, token, eq) {
    pat = "(^|[^A-Za-z0-9_:.-])" attr "=\"[^\"]*\""
    if (!match(tag, pat)) return ""
    token = substr(tag, RSTART, RLENGTH)
    # token is [one boundary char]attr="value"; the value sits between the
    # first ="  and the closing quote that ends the token.
    eq = index(token, "=\"")
    return substr(token, eq + 2, length(token) - eq - 2)
}
# Return s without leading and trailing whitespace ([[:space:]] characters).
# Interior whitespace is left as is.
function trim_ws(s) {
    # The two ends are independent, so the order of removal does not matter.
    sub(/[[:space:]]+$/, "", s)
    sub(/^[[:space:]]+/, "", s)
    return s
}
# Scan an inline CSS declaration list ("a: b; c: d") and return the trimmed
# value of the first "vertical-align" declaration that has a non-empty value.
# The property name is compared case-insensitively. Returns "" when no such
# declaration exists.
# rest, part, pos, key and val are awk-style local variables.
function extract_vertical_align(style, rest, part, pos, key, val) {
    rest = style
    while (rest != "") {
        # Peel off the next ";"-terminated declaration.
        pos = index(rest, ";")
        part = (pos > 0) ? substr(rest, 1, pos - 1) : rest
        rest = (pos > 0) ? substr(rest, pos + 1) : ""
        # Blank or colon-less fragments carry no declaration.
        pos = index(part, ":")
        if (pos == 0) continue
        key = tolower(trim_ws(substr(part, 1, pos - 1)))
        if (key != "vertical-align") continue
        val = trim_ws(substr(part, pos + 1))
        if (val != "") return val
    }
    return ""
}
# Return 1 when the opening <td> tag td_tag carries a style attribute that
# declares a non-empty vertical-align value, 0 otherwise.
# style_attr is an awk-style local variable.
function td_has_vertical_align(td_tag, style_attr) {
    style_attr = extract_attr(td_tag, "style")
    # An absent or empty style yields "" from the scan, i.e. 0 here.
    return (extract_vertical_align(style_attr) != "")
}
# Return td_tag (an opening "<td ...>" tag) with "vertical-align: <align>;"
# added to its inline style.
#   - no style value found: a new style="..." attribute is inserted before
#     the closing ">"; a tag that does not end in ">" is returned unchanged.
#   - existing style value: the declaration is appended to it, adding a ";"
#     separator first when the value does not already end in one.
# The tag is edited with index()/substr() rather than sub(). sub() would
# treat the existing style text as a regular expression, so values such as
# "width: calc(100% - 2px)" would never match, and it would expand any "&" in
# the replacement to the matched text, corrupting the tag.
# The existing style text and align are copied through verbatim: both come
# from attribute values already present in the markup, so they are not
# entity-escaped a second time.
# style_attr, repl, needle and pos are awk-style local variables.
function add_td_vertical_align(td_tag, align, style_attr, repl, needle, pos) {
    style_attr = extract_attr(td_tag, "style")
    if (style_attr == "") {
        if (td_tag !~ />$/) return td_tag
        return substr(td_tag, 1, length(td_tag) - 1) " style=\"vertical-align: " align ";\">"
    }
    needle = "style=\"" style_attr "\""
    pos = index(td_tag, needle)
    if (pos == 0) return td_tag
    repl = style_attr
    if (repl !~ /;[[:space:]]*$/) repl = repl ";"
    repl = repl " vertical-align: " align ";"
    return substr(td_tag, 1, pos - 1) "style=\"" repl "\"" substr(td_tag, pos + length(needle))
}
# For every "<td ...>" that is directly followed (whitespace aside) by an
# "<img ...>" tag on this line, copy the image's inline vertical-align value
# onto the cell, unless the cell already declares one. All other text is
# passed through untouched. Returns the rewritten line.
# Parameters after line are awk-style local variables.
function apply_td_vertical_align(line, out, rest, seg, td_tag, img_tag, after_td, after_img, style_attr, align, new_td) {
    out = ""
    rest = line
    while (match(rest, /<td[^>]*>[[:space:]]*<img[^>]*>/)) {
        # Capture everything that depends on RSTART/RLENGTH first: the helper
        # calls below run match() themselves and overwrite both.
        seg = substr(rest, RSTART, RLENGTH)
        out = out substr(rest, 1, RSTART - 1)
        rest = substr(rest, RSTART + RLENGTH)
        after_td = index(seg, ">")
        after_img = index(seg, "<img")
        if (after_td > 0 && after_img > 0) {
            td_tag = substr(seg, 1, after_td)
            img_tag = substr(seg, after_img)
            style_attr = extract_attr(img_tag, "style")
            align = extract_vertical_align(style_attr)
            if (align != "" && !td_has_vertical_align(td_tag)) {
                new_td = add_td_vertical_align(td_tag, align)
                seg = new_td substr(seg, after_td + 1)
            }
        }
        out = out seg
    }
    return out rest
}
# Rewrite the <img> tags found in line. A tag whose src has an image
# extension and which has no non-empty data-force-inline attribute is kept
# exactly as written, so hand-authored attributes (style, class, ...)
# survive. Every other tag is replaced by the output of render_embed().
# Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_img_tags(line, out, rest, tag, src, alt, force_inline_tag, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /<img[^>]*\/?>/)) {
        # Slice the line before any helper call can overwrite RSTART/RLENGTH.
        pre = substr(rest, 1, RSTART - 1)
        tag = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        src = extract_attr(tag, "src")
        alt = extract_attr(tag, "alt")
        force_inline_tag = extract_attr(tag, "data-force-inline")
        repl = tag
        if (force_inline_tag != "" || !is_image_ext(ext_of(src))) {
            repl = render_embed(src, alt, (alt != ""), (force_inline_tag != ""))
        }
        out = out pre repl
        rest = post
    }
    return out rest
}
# Replace every "!![alt](src)" token in line with a forced-inline embed of
# src. alt is forwarded to render_embed() and flagged as present only when
# it is non-empty. Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_double_bang_with_parens(line, out, rest, token, inside, src, alt, sep, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!!\[[^]]*\]\([^)]*\)/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # token is "!![" alt "](" src ")"; alt cannot contain "]", so the
        # first "](" is the separator.
        sep = index(token, "](")
        alt = substr(token, 4, sep - 4)
        src = substr(token, sep + 2, length(token) - sep - 2)
        repl = render_embed(src, alt, (alt != ""), 1)
        out = out pre repl
        rest = post
    }
    return out rest
}
# Replace every "!![src]" token in line with a forced-inline embed of src
# (no alt text). Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_double_bang_bare(line, out, rest, token, src, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!!\[[^]]+\]/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # Drop the leading "!![" (3 chars) and the trailing "]".
        src = substr(token, 4, length(token) - 4)
        repl = render_embed(src, "", 0, 1)
        out = out pre repl
        rest = post
    }
    return out rest
}
# Replace every "![src]" token in line with an embed of src, without alt
# text and without forcing inline rendering. Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_bare_bang(line, out, rest, token, src, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!\[[^]]+\]/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # Drop the leading "![" (2 chars) and the trailing "]".
        src = substr(token, 3, length(token) - 3)
        repl = render_embed(src, "", 0, 0)
        out = out pre repl
        rest = post
    }
    return out rest
}
# Apply the embed rewrites to a line outside of <pre><code> blocks while
# leaving inline "<code>...</code>" spans untouched. Text outside those spans
# goes through, in order: rewrite_img_tags, rewrite_double_bang_with_parens,
# rewrite_double_bang_bare and rewrite_bare_bang. If a "<code>" has no
# closing "</code>" on the line, the remainder is copied through as is.
# Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_noncode_line(line, out, rest, pstart, pend, code_seg, noncode) {
    out = ""
    rest = line
    while (1) {
        pstart = index(rest, "<code>")
        # Text up to the next <code> span, or all remaining text if none.
        noncode = (pstart == 0) ? rest : substr(rest, 1, pstart - 1)
        noncode = rewrite_bare_bang(rewrite_double_bang_bare(rewrite_double_bang_with_parens(rewrite_img_tags(noncode))))
        out = out noncode
        if (pstart == 0) break
        rest = substr(rest, pstart)
        pend = index(rest, "</code>")
        if (pend == 0) {
            out = out rest
            break
        }
        # Copy the code span verbatim, closing tag included.
        code_seg = substr(rest, 1, pend + length("</code>") - 1)
        out = out code_seg
        rest = substr(rest, pend + length("</code>"))
    }
    return out
}
# Inside a code block: replace every "!![alt](src)" token with the escaped
# contents of src, bypassing the extension whitelist. The alt text is
# ignored. A token whose include comes back empty is left as written.
# Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_code_double_bang_with_parens(line, out, rest, token, inside, src, sep, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!!\[[^]]*\]\([^)]*\)/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # src sits between the first "](" and the final ")".
        sep = index(token, "](")
        src = substr(token, sep + 2, length(token) - sep - 2)
        repl = render_code_include(src, 1)
        out = out pre ((repl == "") ? token : repl)
        rest = post
    }
    return out rest
}
# Inside a code block: replace every "!![src]" token with the escaped
# contents of src, bypassing the extension whitelist. A token whose include
# comes back empty is left as written. Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_code_double_bang_bare(line, out, rest, token, src, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!!\[[^]]+\]/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # Drop the leading "!![" (3 chars) and the trailing "]".
        src = substr(token, 4, length(token) - 4)
        repl = render_code_include(src, 1)
        out = out pre ((repl == "") ? token : repl)
        rest = post
    }
    return out rest
}
# Inside a code block: replace every "![alt](src)" token with the escaped
# contents of src, subject to the inline-text extension whitelist. The alt
# text is ignored. A token whose include comes back empty is left as
# written. Returns the rewritten line.
# Parameters after line are awk-style local variables.
function rewrite_code_bang_with_parens(line, out, rest, token, inside, src, sep, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!\[[^]]*\]\([^)]*\)/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # src sits between the first "](" and the final ")".
        sep = index(token, "](")
        src = substr(token, sep + 2, length(token) - sep - 2)
        repl = render_code_include(src, 0)
        out = out pre ((repl == "") ? token : repl)
        rest = post
    }
    return out rest
}
# Inside a code block: replace every "![src]" token with the escaped
# contents of src, subject to the inline-text extension whitelist. A token
# whose include comes back empty is left as written. Returns the rewritten
# line.
# Parameters after line are awk-style local variables.
function rewrite_code_bare_bang(line, out, rest, token, src, pre, post, repl) {
    out = ""
    rest = line
    while (match(rest, /!\[[^]]+\]/)) {
        pre = substr(rest, 1, RSTART - 1)
        token = substr(rest, RSTART, RLENGTH)
        post = substr(rest, RSTART + RLENGTH)
        # Drop the leading "![" (2 chars) and the trailing "]".
        src = substr(token, 3, length(token) - 3)
        repl = render_code_include(src, 0)
        out = out pre ((repl == "") ? token : repl)
        rest = post
    }
    return out rest
}
# Undo the placeholder encoding found in line and return the result:
#   - <mfmplain> / </mfmplain> become <span class="mfm-plain"> / </span>
#   - "\034P<n>\034" markers (n = 0..8) become the literal characters
#     * _ ` [ ] ( ) ! $ respectively
# NOTE(review): the markers are presumably written by an earlier stage to
# shield these characters from markdown processing - confirm.
function restore_plain_markers(line) {
    # Tag and marker substitutions cannot produce each other's input, so the
    # two groups are independent of one another.
    gsub(/<mfmplain>/, "<span class=\"mfm-plain\">", line)
    gsub(/<\/mfmplain>/, "</span>", line)
    gsub(/\034P0\034/, "*", line)
    gsub(/\034P1\034/, "_", line)
    gsub(/\034P2\034/, "`", line)
    gsub(/\034P3\034/, "[", line)
    gsub(/\034P4\034/, "]", line)
    gsub(/\034P5\034/, "(", line)
    gsub(/\034P6\034/, ")", line)
    gsub(/\034P7\034/, "!", line)
    gsub(/\034P8\034/, "$", line)
    return line
}
# Start-up: begin outside any code block and remember the directory of the
# input document so relative embeds can be resolved against it.
# input_file is not assigned in the visible source; presumably it is passed
# with -v (TODO confirm).
BEGIN {
    in_pre_code = 0
    input_dir = dirname_of(input_file)
}
# Per-line driver. Lines that open or lie inside a <pre><code> block get the
# code-include rewrites and highlighting; all other lines get the regular
# embed rewrites. Every line then has its table-cell alignment fixed up and
# its placeholder markers restored before it is printed.
{
    line = $0
    # Both flags are taken from the raw input line, before any rewriting.
    start_pre = (line ~ /<pre><code>/)
    end_pre = (line ~ /<\/code><\/pre>/)
    if (!in_pre_code && !start_pre) {
        line = rewrite_noncode_line(line)
    } else {
        # Hide backslash-escaped "\![" from the rewriters; it is restored as
        # a plain "![" afterwards.
        gsub(/\\!\[/, "\034ESC_BANG_OPEN\034", line)
        line = rewrite_code_bare_bang(rewrite_code_bang_with_parens(rewrite_code_double_bang_bare(rewrite_code_double_bang_with_parens(line))))
        gsub(/\034ESC_BANG_OPEN\034/, "![", line)
        line = highlight_code_block_line(line)
    }
    line = restore_plain_markers(apply_td_vertical_align(line))
    print line
    # Track multi-line code blocks: enter when a block opens without closing
    # on the same line, leave when the closing tags are seen.
    if (start_pre && !end_pre) in_pre_code = 1
    else if (in_pre_code && end_pre) in_pre_code = 0
}