Rename to updateDocumentToC.py and adapt for use with Catch

Add the missing GPL 3.0 license (thanks to @horenmar for noting it).
2025-05-25 22:19:25 +00:00 · 2017-08-29 21:34:43 +02:00 · 2017-08-29 21:34:43 +02:00 · 61280e6d0a
commit 61280e6d0a
parent 7e9b53e40c
1 changed files with 182 additions and 136 deletions
--- a/scripts/updateDocumentToC.py
+++ b/scripts/updateDocumentToC.py
@ -0,0 +1,446 @@
+#!/usr/bin/env python
+
+#
+# updateDocumentToC.py
+#
+# Insert table of contents at top of Catch markdown documents.
+#
+# This script is distributed under the GNU General Public License v3.0
+#
+# It is based on markdown-toclify version 1.7.1 by Sebastian Raschka,
+# https://github.com/rasbt/markdown-toclify
+#
+
+from  __future__  import print_function
+from scriptCommon import catchPath
+
+import argparse
+import glob
+import os
+import re
+import sys
+
+# Configuration:
+
+# default for --min-toc-entries: documents with fewer collected headlines
+# than this get no table of contents
+minTocEntries = 4
+
+# header levels that are left out of the table of contents
+headingExcludeDefault = [1,3,4,5]  # use level 2 headers by default
+headingExcludeRelease = [2,3,4,5]  # use level 1 headers for release-notes.md
+
+# glob pattern of the documents processed when no path is given
+documentsDefault = os.path.join(os.path.relpath(catchPath), 'docs/*.md')
+releaseNotesName = 'release-notes.md'
+
+# first line of a generated table of contents; removeToC() recognizes an
+# existing table by this text
+contentTitle = '**Contents**  '
+# 1-based line number at which the table of contents is inserted / expected
+contentLineNo = 4
+# the same position as a 0-based list index
+contentLineNdx = contentLineNo - 1
+
+# End configuration
+
+# characters dashifyHeadline() keeps in an anchor id; every other character
+# is turned into '-'
+VALIDS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-&'
+
+def readLines(in_file):
+    """
+    Reads the markdown file `in_file` and returns its text split on '\\n'.
+
+    A file that ends with a newline therefore yields a trailing empty string.
+    """
+    with open(in_file, 'r') as source:
+        text = source.read()
+
+    return text.split('\n')
+
+def removeLines(lines, remove=('[[back to top]', '<a class="mk-toclify"')):
+    """
+    Returns the lines that do not start with any of the `remove` prefixes.
+
+    With the default prefixes this drops existing [back to top] links and
+    mk-toclify <a id> tags. An empty `remove` yields a plain copy of `lines`.
+    """
+    if remove:
+        return [line for line in lines if not line.startswith(remove)]
+
+    return lines[:]
+
+def removeToC(lines):
+    """
+    Removes an existing table of contents starting at index contentLineNdx.
+
+    The table is recognized by a line starting with contentTitle at that
+    index. The title line, every directly following line that starts with
+    '[' (the entries) and the one line after the entries (the separator
+    written together with the table) are removed.
+
+    Returns a new list; `lines` itself is not modified. Documents that are
+    too short to hold a table, or that have none, are returned as a copy.
+    """
+    # a document shorter than contentLineNo lines cannot contain a table
+    if len(lines) <= contentLineNdx:
+        return lines[:]
+
+    if not lines[contentLineNdx].startswith(contentTitle):
+        return lines[:]
+
+    # skip the entries; stop at the end of the document if the table is
+    # the last thing in the file
+    pos = contentLineNdx + 1
+    while pos < len(lines) and lines[pos].startswith('['):
+        pos = pos + 1
+
+    # lines[pos] is the separator line following the entries: drop it too
+    return lines[:contentLineNdx] + lines[pos + 1:]
+
+def dashifyHeadline(line):
+    """
+    Takes a header line from a Markdown document and
+    returns a list of
+        the '#'-stripped version of the head line,
+        a string version for <a id=''></a> anchor tags,
+        and the level of the headline as integer.
+    A list (not a tuple) is returned because positioningHeadlines()
+    adjusts the level in place.
+    E.g.,
+    >>> dashifyHeadline('### some header lvl3')
+    ['some header lvl3', 'some-header-lvl3', 3]
+
+    """
+    # Strip surrounding whitespace first: otherwise a header indented by
+    # spaces gets level 0 and keeps its '#' characters in the text, and
+    # trailing blanks hide a closing '#' run from rstrip('#').
+    trimmed = line.strip()
+
+    stripped_right = trimmed.rstrip('#')
+    stripped_both = stripped_right.lstrip('#')
+    # the level is the number of leading '#' characters
+    level = len(stripped_right) - len(stripped_both)
+    stripped_wspace = stripped_both.strip()
+
+    # character replacements: '.' and '/' vanish, every other character
+    # outside VALIDS becomes a dash
+    without_dots = stripped_wspace.replace('.', '')
+    without_slashes = without_dots.replace('/', '')
+    rem_nonvalids = ''.join([c if c in VALIDS
+                             else '-' for c in without_slashes])
+
+    lowered = rem_nonvalids.lower()
+    dashified = re.sub(r'(-)\1+', r'\1', lowered)  # remove duplicate dashes
+    dashified = dashified.strip('-')  # strip dashes from start and end
+
+    # exception '&' (double-dash in github)
+    dashified = dashified.replace('-&-', '--')
+
+    return [stripped_wspace, dashified, level]
+
+def tagAndCollect(lines, id_tag=True, back_links=False, exclude_h=None):
+    """
+    Gets headlines from the markdown document and creates anchor tags.
+
+    Keyword arguments:
+        lines: a list of strings, one per line of the Markdown document.
+        id_tag: if true, inserts an <a id> tag above each collected
+            headline (not req. by GitHub)
+        back_links: if true, adds "back to top" links below each headline,
+            including headlines whose level is excluded
+        exclude_h: header levels to exclude. E.g., [2, 3]
+            excludes level 2 and 3 headings.
+
+    Returns a tuple of 2 lists:
+        1st list:
+            A copy of the input list with the <a id="some-header"></a>
+            anchor tags and "back to top" links inserted as requested.
+            Every input line is kept, whether or not it is a headline.
+
+        2nd list:
+            A list of 3-value sublists, where the first value
+            represents the heading, the second value the string
+            that is used as ID in the anchor tags,
+            and the third value is an integer that represents the
+            headline level.
+            E.g.,
+            [['some header lvl3', 'some-header-lvl3', 3], ...]
+
+    """
+    out_contents = []
+    headlines = []
+    for l in lines:
+        l_stripped = l.lstrip()
+        indent = l[:len(l) - len(l_stripped)]
+
+        # comply with new markdown standards; a headline
+        #   - starts with 1 to 6 '#' followed by a space,
+        #   - is indented by at most 3 spaces (more, or a tab, makes it
+        #     part of an indented code block),
+        #   - is not empty, i.e. has more than '#' and blanks.
+        # Lines failing a check are ordinary content: they must still be
+        # copied to the output and must not be skipped with 'continue'.
+        saw_headline = bool(
+            l_stripped.startswith(('# ', '## ', '### ', '#### ', '##### ', '###### '))
+            and len(indent) <= 3
+            and not indent.strip(' ')
+            and set(l_stripped) - {'#', ' '})
+
+        if saw_headline:
+            # pass the unindented text so that the level is counted right
+            dashified = dashifyHeadline(l_stripped)
+
+            if not exclude_h or dashified[-1] not in exclude_h:
+                if id_tag:
+                    anchor = '<a class="mk-toclify" id="%s"></a>'\
+                              % (dashified[1])
+                    out_contents.append(anchor)
+                headlines.append(dashified)
+
+        out_contents.append(l)
+        if back_links and saw_headline:
+            out_contents.append('[[back to top](#table-of-contents)]')
+    return out_contents, headlines
+
+def positioningHeadlines(headlines):
+    """
+    Shifts all headlines one level up when none of them is of level 1.
+
+    The level is the last item of each row and is changed in place; the
+    list passed in is also the list returned.
+    """
+    has_top_level = any(entry[-1] == 1 for entry in headlines)
+
+    if has_top_level:
+        return headlines
+
+    for entry in headlines:
+        entry[-1] -= 1
+    return headlines
+
+def createToc(headlines, hyperlink=True, top_link=False, no_toc_header=False):
+    """
+    Builds the lines of the table of contents from the headline list
+    returned by the tagAndCollect function.
+
+    Keyword Arguments:
+        headlines: list of [heading, anchor id, level] lists,
+            e.g., [['Some header lvl3', 'some-header-lvl3', 3], ...]
+        hyperlink: if True, each entry is a Markdown link followed by
+            two spaces, e.g., '[Some header lvl1](#some-header-lvl1)  ';
+            otherwise each entry is a plain list item indented by four
+            spaces per level above 1, e.g., '    - Some header lvl2'
+        top_link: if True, put an id tag for linking the table
+            of contents itself (for the back-to-top-links) before the
+            title; only used when the title is written
+        no_toc_header: suppresses the contentTitle line if True.
+
+    Returns the list of lines of the table of contents; its last item
+    is always '\\n'.
+
+    """
+    toc = []
+
+    if not no_toc_header:
+        if top_link:
+            toc.append('<a class="mk-toclify" id="table-of-contents"></a>\n')
+        toc.append(contentTitle)
+
+    if hyperlink:
+        toc.extend(
+            '[%s](#%s)  ' % (entry[0], entry[1]) for entry in headlines)
+    else:
+        toc.extend(
+            '%s- %s' % ((entry[2]-1)*'    ', entry[0]) for entry in headlines)
+
+    toc.append('\n')
+    return toc
+
+def buildMarkdown(toc_headlines, body, spacer=0, placeholder=None):
+    """
+    Returns a string with the Markdown output contents incl.
+    the table of contents.
+
+    Keyword arguments:
+        toc_headlines: lines for the table of contents
+            as created by the createToc function.
+        body: lines of the Markdown file as returned by the
+            tagAndCollect function.
+        spacer: if set, a <div> of this height in pixels is appended
+            to the table of contents.
+        placeholder: if given, every occurrence of this string in the
+            body is replaced by the table of contents; otherwise the
+            table is inserted in front of the line with index
+            contentLineNdx.
+
+    """
+    toc_lines = toc_headlines
+    if spacer:
+        toc_lines = toc_headlines + [
+            '\n<div style="height:%spx;"></div>\n' % (spacer)]
+    toc_markdown = "\n".join(toc_lines)
+
+    if placeholder:
+        return "\n".join(body).replace(placeholder, toc_markdown)
+
+    # no placeholder: split the body at the insertion point
+    head = "\n".join(body[:contentLineNdx]) + '\n'
+    tail = "\n".join(body[contentLineNdx:])
+    return head + toc_markdown + tail
+
+def outputMarkdown(markdown_cont, output_file):
+    """
+    Writes `markdown_cont` to `output_file`, replacing existing content.
+
+    Nothing is written when `output_file` is empty or None.
+    """
+    if not output_file:
+        return
+
+    with open(output_file, 'w') as sink:
+        sink.write(markdown_cont)
+
+def markdownToclify(
+    input_file,
+    output_file=None,
+    min_toc_len=2,
+    github=False,
+    back_to_top=False,
+    nolink=False,
+    no_toc_header=False,
+    spacer=0,
+    placeholder=None,
+    exclude_h=None):
+    """ Function to add table of contents to markdown files.
+
+    An existing table of contents, [back to top] links and mk-toclify
+    anchor tags are removed from the input before the new table is built.
+
+    Parameters
+    -----------
+      input_file: str
+        Path to the markdown input file.
+
+      output_file: str (default: None)
+        Path to the markdown output file; nothing is written if empty.
+
+      min_toc_len: int (default: 2)
+        Minimum number of entries to create a table of contents for.
+        With fewer entries the document is written without a table.
+
+      github: bool (default: False)
+        Uses GitHub TOC syntax if True: no <a id> anchor tags are
+        inserted above the headings or above the table.
+
+      back_to_top: bool (default: False)
+        Inserts back-to-top links below headings if True.
+
+      nolink: bool (default: False)
+        Creates the table of contents without internal links if True.
+
+      no_toc_header: bool (default: False)
+        Suppresses the Table of Contents header if True
+
+      spacer: int (default: 0)
+        Inserts vertical space (in pixels) after the table of contents.
+
+      placeholder: str (default: None)
+        Inserts the TOC at the placeholder string instead
+        of inserting the TOC at the top of the document.
+
+      exclude_h: list (default None)
+        Excludes header levels, e.g., if [2, 3], ignores header
+        levels 2 and 3 in the TOC.
+
+    Returns
+    -----------
+    changed: Boolean
+      True if the generated document differs from the content of
+      input_file, False otherwise.
+
+    """
+    original_lines = readLines(input_file)
+
+    cleaned_contents = removeLines(
+        removeToC(original_lines),
+        remove=('[[back to top]', '<a class="mk-toclify"'))
+
+    processed_contents, raw_headlines = tagAndCollect(
+        cleaned_contents,
+        id_tag=not github,
+        back_links=back_to_top,
+        exclude_h=exclude_h)
+
+    # add table of contents?
+    if len(raw_headlines) < min_toc_len:
+        processed_headlines = []
+    else:
+        leftjustified_headlines = positioningHeadlines(raw_headlines)
+
+        processed_headlines = createToc(
+            leftjustified_headlines,
+            hyperlink=not nolink,
+            top_link=not nolink and not github,
+            no_toc_header=no_toc_header)
+
+    # without links neither anchors nor back-links are wanted in the body
+    if nolink:
+        processed_contents = cleaned_contents
+
+    cont = buildMarkdown(
+        toc_headlines=processed_headlines,
+        body=processed_contents,
+        spacer=spacer,
+        placeholder=placeholder)
+
+    if output_file:
+        outputMarkdown(cont, output_file)
+
+    # readLines() splits on '\n', so joining restores the text as read
+    return cont != '\n'.join(original_lines)
+
+def isReleaseNotes(f):
+    """Tells whether the file name part of path `f` is releaseNotesName."""
+    file_name = os.path.basename(f)
+    return file_name == releaseNotesName
+
+def excludeHeadingsFor(f):
+    """Returns the header levels to leave out of the ToC of file `f`."""
+    if isReleaseNotes(f):
+        return headingExcludeRelease
+    return headingExcludeDefault
+
+def updateSingleDocumentToC(input_file, min_toc_len, verbose=False):
+    """
+    Add or update table of contents in specified file.
+
+    The new document is first written to input_file + '.tmp'. When it
+    differs from the current content it replaces input_file; otherwise
+    the temporary file is removed and input_file is left untouched.
+
+    Return 1 if file changed, 0 otherwise.
+    """
+    if verbose :
+        print( 'file: {}'.format(input_file))
+
+    output_file = input_file + '.tmp'
+
+    markdownToclify(
+        input_file=input_file,
+        output_file=output_file,
+        min_toc_len=min_toc_len,
+        github=True,
+        back_to_top=False,
+        nolink=False,
+        no_toc_header=False,
+        spacer=False,
+        placeholder=False,
+        exclude_h=excludeHeadingsFor(input_file))
+
+    # do not rewrite (nor count) a document that is already up to date
+    if readLines(input_file) == readLines(output_file):
+        os.remove(output_file)
+        return 0
+
+    # prevent race-condition (Python 3.3):
+    if sys.version_info >= (3, 3):
+        os.replace(output_file, input_file)
+    else:
+        os.remove(input_file)
+        os.rename(output_file, input_file)
+
+    return 1
+
+def updateDocumentToC(paths, min_toc_len, verbose):
+    """
+    Add or update table of contents of all files matching the glob
+    patterns in `paths`; matches that are not regular files are skipped.
+
+    Return number of changed files, i.e. the sum of the results of
+    updateSingleDocumentToC().
+    """
+    return sum(
+        updateSingleDocumentToC(
+            input_file=candidate, min_toc_len=min_toc_len, verbose=verbose)
+        for pattern in paths
+        for candidate in glob.glob(pattern)
+        if os.path.isfile(candidate))
+
+def updateDocumentToCMain():
+    """
+    Add or update table of contents to specified paths.
+
+    Parses the command line, processes the given files (documentsDefault
+    if none is given) and prints a one-line summary.
+    """
+
+    parser = argparse.ArgumentParser(
+        description='Add or update table of contents in markdown documents.',
+        epilog="""""",
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument(
+        'Input',
+        metavar='file',
+        type=str,
+        nargs=argparse.REMAINDER,
+        help='files to process, at default: docs/*.md')
+
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='report the name of the file being processed')
+
+    parser.add_argument(
+        '--min-toc-entries',
+        dest='minTocEntries',
+        default=minTocEntries,
+        type=int,
+        metavar='N',
+        help='the minimum number of entries to create a table of contents for [{deflt}]'.format(deflt=minTocEntries))
+
+    # A table is removed by demanding more entries than any document can
+    # have. sys.maxsize is used so that this also holds for documents with
+    # very many headings, where a fixed number such as 99 would fail.
+    parser.add_argument(
+        '--remove-toc',
+        action='store_const',
+        dest='minTocEntries',
+        const=sys.maxsize,
+        help='remove all tables of contents')
+
+    args = parser.parse_args()
+
+    paths = args.Input if len(args.Input) > 0 else [documentsDefault]
+
+    changedFiles = updateDocumentToC(paths=paths, min_toc_len=args.minTocEntries, verbose=args.verbose)
+
+    if changedFiles > 0:
+        print( "Processed table of contents in " + str(changedFiles) + " file(s)" )
+    else:
+        print( "No table of contents added or updated" )
+
+# run the command line interface only when executed as a script,
+# not when imported as a module
+if __name__ == '__main__':
+    updateDocumentToCMain()
+
+# end of file