[Cryptech-Commits] [user/sra/pelican] 57/68: Refactor Wiki2Markdown as class

git at cryptech.is git at cryptech.is
Mon Jul 19 22:25:36 UTC 2021


This is an automated email from the git hooks/post-receive script.

sra at hactrn.net pushed a commit to branch pelican
in repository user/sra/pelican.

commit a4a027a6700e1197a50b24bdfbde1697049b7348
Author: Rob Austein <sra at hactrn.net>
AuthorDate: Tue Feb 16 06:46:38 2021 +0000

    Refactor Wiki2Markdown as class
    
    Makes portions of the mess clearer, not as much as I'd hoped.
    
    With a bit more work we could fold header-crunching into the existing
    regexp mechanism, but code blocks, links, and tables require state.
---
 extract.py |   4 +-
 trac2md.py | 335 ++++++++++++++++++++++++++++++-------------------------------
 2 files changed, 168 insertions(+), 171 deletions(-)

diff --git a/extract.py b/extract.py
index c7e35b9..793b502 100755
--- a/extract.py
+++ b/extract.py
@@ -92,6 +92,8 @@ def main():
 
     os.link("pelicanconf.py", "pelican/pelicanconf.py")
 
+    wiki_to_markdown = trac2md.Trac2Markdown()
+
     keep = Filter()
 
     first_published = {}
@@ -105,7 +107,7 @@ def main():
             #print(slug, row.version)
             with open("wiki/{}.trac".format(slug), "w") as f:
                 f.write(row.text)
-            md = markdown_header(row, first_published) + trac2md.WikiToMD(row.text, slug)
+            md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
             with open("pelican/content/{}.md".format(slug), "w") as f:
                 f.write(md)
 
diff --git a/trac2md.py b/trac2md.py
index e16845b..c7cf85e 100755
--- a/trac2md.py
+++ b/trac2md.py
@@ -5,192 +5,187 @@
 import re
 from urllib.parse import quote
 
-content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
-
-traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]")
-
-image_pattern = re.compile(r"\[\[Image\((.*)\)\]\]")
-
-wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]")
-
-strikethrough_pattern = re.compile(r"~~([^~]+)~~")
-bangquote_pattern = re.compile(r"!((?:\w|[#])+)")
-linebreak_pattern = re.compile(r"\\\\$")
-
-camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
-
-span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]")
-
-delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I)
-
-wikiheading_patterns = tuple(
-    (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
-    for level in range(1, 7))
-
-
-def convert_headers(line):
-    for level_count, header in wikiheading_patterns:
-        try:
-            level = header.search(line).group(1)
-            if level:
-                line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
-                break          # No need to check other heading levels
-        except:
-            pass                # Try the next heading level
-    return line
-
-
-def convert_traclink_to_creolelink(m):
-    # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
-    # Creole's is easier to parse and harder to confuse with partially converted Markdown.
-
-    text = m.group(1).strip()
-    if " " in text:
-        return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
-    elif ":" in text or camelcase_pattern.match(text):
-        return "[[{}]]".format(text)
-    else:
-        return m.group(0)
-
-
-# Probably most of the non-wiki scheme tests should become a table in an
-# extended JSON config file which maps
-#
-#   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
-
-def convert_wikilinks(m, slug, giturl):
-    scheme, link, text = [p.strip() if p else p for p in  m.groups()]
-    if text is None:
-        text = link
-    if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
-        link = link[1:-1]
-    if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
-        text = text[1:-1]
-    if text == link and link.startswith("http") and "://" in link:
-        return "<{}>".format(link)
-    elif scheme == "attachment:":
-        return "[{}]({{attach}}{}/{})".format(text, slug, link)
-    elif scheme in ("source:", "browser:"):
-        return "[{}]({}/{})".format(text, giturl.rstrip("/"), link.lstrip("/"))
-    elif scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link)):
-        return "[{}]({{filename}}{}.md)".format(text, link)
-    else:
-        return "[{}]({})".format(text, link)
-
-
-def convert_image(m, slug):
-    text = m.group(1).split(",")[0].strip()
-    if "://" in text:
-        return "<img src=\"{}\">".format(text)
-    else:
-        return "![{}]({{attach}}{}/{})".format(text, slug, quote(text, ""))
-
-
-def WikiToMD(content, slug):
-
-    code_block = False
-    in_list = False
-    in_table = False
-    nested_level = 0
-    prev_indent = 0
-    old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
-    new_content = []
-
-    while old_content:
-        line = old_content.pop(0).rstrip()
-        tail = ["\n"]
-        while "{{{" in line or "}}}" in line:
-            if "{{{" in line:
-                code_block = True
-                line = line.replace("{{{", "```")
-            if "}}}" in line:
-                code_block = False
-                line = line.replace("}}}", "```")
-        if not code_block:
+class Trac2Markdown:
+
+    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
+    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
+
+    wikiheading_patterns = tuple(
+        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
+        for level in range(1, 7)
+    )
+
+    def convert_headers(self, line):
+        for level_count, header in self.wikiheading_patterns:
+            try:
+                level = header.search(line).group(1)
+                if level:
+                    line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
+                    break          # No need to check other heading levels
+            except:
+                pass                # Try the next heading level
+        return line
+
+    def convert_to_creole(self, m):
+        # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
+        # Creole's is easier to parse and harder to confuse with partially converted Markdown.
+
+        text = m.group(1).strip()
+        if " " in text:
+            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
+        elif ":" in text or self.camelcase_pattern.match(text):
+            return "[[{}]]".format(text)
+        else:
+            return m.group(0)
+
+    # Probably most of the non-wiki scheme tests should become a table in an
+    # extended JSON config file which maps
+    #
+    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
+
+    def convert_wikilinks(self, m):
+        scheme, link, text = [p.strip() if p else p for p in  m.groups()]
+        if text is None:
+            text = link
+        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
+            link = link[1:-1]
+        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
+            text = text[1:-1]
+        if text == link and link.startswith("http") and "://" in link:
+            return "<{}>".format(link)
+        elif scheme == "attachment:":
+            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
+        elif scheme in ("source:", "browser:"):
+            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
+        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
+            return "[{}]({{filename}}{}.md)".format(text, link)
+        else:
+            return "[{}]({})".format(text, link)
+
+    def convert_image(self, m):
+        text = m.group(1).split(",")[0].strip()
+        if "://" in text:
+            return "<img src=\"{}\">".format(text)
+        else:
+            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))
+
+    def __init__(self, source_url = "https://git.cryptech.is/"):
+        self.source_url = source_url
+        self.pattern_actions = (
 
             # Convert CamelCase links to explicit links
-            line = camelcase_pattern.sub(r"[[\1]]", line)
-
-            # Convert TracLinks to WikiCreole links to simplify remaining processing
-            line = traclink_pattern.sub(convert_traclink_to_creolelink, line)
-
-            # Convert tables.  References:
-            #   https://github.github.com/gfm/#tables-extension-
-            #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
-            # Table start: line containing "||"; table end: blank line?
-            #
-            # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
-            # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
-            # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
-            # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
-            # if the rows do anything different, ouch, because markdown specifies in delimiter line.
-            #
-            # Might do something clever with the "=" markers and alignment, start with just getting the basic table
-            # structure to something markdown will believe.
-
-            if line.strip().startswith("||"):
-                line = line.replace("=|", "|").replace("|=", "|")
-                line = line.replace("||", "|")
-                if not in_table:
-                    tail.append("|---" * (line.count("|") - 1) + "|\n")
-                in_table = True
-            elif in_table and not line.strip().startswith("||"):
-                new_content.append("\n")
-                in_table = False
-
-            #
-            # Convert bullet lists.  The start and end of a list needs an empty line.
-            #
-            nested_line = line.lstrip(' ')
-            if nested_line.startswith('- ') or nested_line.startswith('* '):
-                if not in_list:
-                    new_content.append("\n")
-                    nested_level = 0
-                    prev_indent = 0
-                    in_list = True
-                indent = len(line) - len(nested_line)
-                if indent > prev_indent:
-                    nested_level += 1
-                elif indent < prev_indent:
-                    nested_level -= 1
-                prev_indent = indent
-                line = '    ' * nested_level + nested_line
-            elif in_list:
-                new_content.append("\n")
-                in_list = False
-                nested_level = 0
-                prev_indent = 0
+            (self.camelcase_pattern,                                                    r"[[\1]]"),
+
+            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
+            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),
 
             # Convert !x quoting
-            line = bangquote_pattern.sub(r"\1", line)
+            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),
 
             # Convert (limited subset of) spans
-            line = span_pattern.sub(r"\1", line)
-
-            # Convert headers
-            line = convert_headers(line)
+            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),
 
             # Convert images
-            line = image_pattern.sub(lambda m: convert_image(m, slug), line)
+            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),
 
             # Delete Trac macros that have no useful counterpart
-            line = delete_pattern.sub("", line)
+            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),
 
             # Convert wiki links
-            line = wikilink_pattern.sub(lambda m: convert_wikilinks(m, slug, "https://git.cryptech.is/"), line)
+            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),
 
             # Convert striked through text
-            line = strikethrough_pattern.sub(r"<s>\1</s>", line)
+            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),
 
-            # Convert line breaks
-            # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
-            line = linebreak_pattern.sub("  ", line)
+            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
+            (re.compile(r"\\\\$"),                                                      r"  "),
 
             # Convert bold and italic text (do this last)
-            line = line.replace("'''", "**")  # Convert bold text
-            line = line.replace("''", "*")  # Convert italic text
+            (re.compile(r"'''"),                                                        r"**"),
+            (re.compile(r"''"),                                                         r"*"),
+        )
+
+    def __call__(self, content, slug):
+        self.slug = slug
+
+        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
+        new_content = []
+
+        code_block = False
+        in_list = False
+        in_table = False
+        nested_level = 0
+        prev_indent = 0
+
+        while old_content:
+            line = old_content.pop(0).rstrip()
+            tail = ["\n"]
+            while "{{{" in line or "}}}" in line:
+                if "{{{" in line:
+                    code_block = True
+                    line = line.replace("{{{", "```")
+                if "}}}" in line:
+                    code_block = False
+                    line = line.replace("}}}", "```")
+            if not code_block:
+
+                # Convert tables.  References:
+                #   https://github.github.com/gfm/#tables-extension-
+                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
+                # Table start: line containing "||"; table end: blank line?
+                #
+                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
+                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
+                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
+                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
+                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
+                #
+                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
+                # structure to something markdown will believe.
+
+                if line.strip().startswith("||"):
+                    line = line.replace("=|", "|").replace("|=", "|")
+                    line = line.replace("||", "|")
+                    if not in_table:
+                        tail.append("|---" * (line.count("|") - 1) + "|\n")
+                    in_table = True
+                elif in_table and not line.strip().startswith("||"):
+                    new_content.append("\n")
+                    in_table = False
+
+                #
+                # Convert bullet lists.  The start and end of a list needs an empty line.
+                #
+                nested_line = line.lstrip(' ')
+                if nested_line.startswith('- ') or nested_line.startswith('* '):
+                    if not in_list:
+                        new_content.append("\n")
+                        nested_level = 0
+                        prev_indent = 0
+                        in_list = True
+                    indent = len(line) - len(nested_line)
+                    if indent > prev_indent:
+                        nested_level += 1
+                    elif indent < prev_indent:
+                        nested_level -= 1
+                    prev_indent = indent
+                    line = '    ' * nested_level + nested_line
+                elif in_list:
+                    new_content.append("\n")
+                    in_list = False
+                    nested_level = 0
+                    prev_indent = 0
+
+                # Convert headers
+                line = self.convert_headers(line)
+
+                # Rest is regexp-driven conversions
+                for pattern, action in self.pattern_actions:
+                    line = pattern.sub(action, line)
+
+            new_content.append(line)
+            new_content.extend(tail)
 
-        new_content.append(line)
-        new_content.extend(tail)
+        del self.slug
 
-    return "".join(new_content)
+        return "".join(new_content)



More information about the Commits mailing list