Martin Moene | 61280e6 | 2017-08-29 21:34:43 +0200 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | |
| 3 | # |
| 4 | # updateDocumentToC.py |
| 5 | # |
| 6 | # Insert table of contents at top of Catch markdown documents. |
| 7 | # |
| 8 | # This script is distributed under the GNU General Public License v3.0 |
| 9 | # |
| 10 | # It is based on markdown-toclify version 1.7.1 by Sebastian Raschka, |
| 11 | # https://github.com/rasbt/markdown-toclify |
| 12 | # |
| 13 | |
| 14 | from __future__ import print_function |
| 15 | from scriptCommon import catchPath |
| 16 | |
| 17 | import argparse |
| 18 | import glob |
| 19 | import os |
| 20 | import re |
| 21 | import sys |
| 22 | |
# Configuration:

# Default for --min-toc-entries: a document gets a table of contents only
# when at least this many headings were collected for it.
minTocEntries = 4

# Heading levels that are left out of the table of contents.
headingExcludeDefault = [1,3,4,5] # use only level 2 headers by default
headingExcludeRelease = [2,3,4,5] # use only level 1 headers for release-notes.md

# Glob pattern of the documents processed when no files are given on the
# command line, and the base name of the document that is handled with
# headingExcludeRelease instead of headingExcludeDefault.
documentsDefault = os.path.join(os.path.relpath(catchPath), 'docs/*.md')
releaseNotesName = 'release-notes.md'

# Title line that starts a generated table of contents, the 1-based line
# number at which the table is placed, and the matching 0-based list index.
contentTitle = '**Contents**'
contentLineNo = 4
contentLineNdx = contentLineNo - 1

# End configuration

# Characters that dashifyHeadline() keeps when it turns a heading into an
# anchor id; '.' and '/' are removed and every other character becomes '-'.
VALIDS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-&'
| 40 | |
def readLines(in_file):
    """Read the markdown file `in_file` and return its lines as a list.

    The text is split on newline characters, so a file that ends with a
    newline yields a trailing empty string.
    """
    with open(in_file, 'r') as handle:
        text = handle.read()
    return text.split('\n')
| 47 | |
def removeLines(lines, remove=('[[back to top]', '<a class="mk-toclify"')):
    """Return a copy of `lines` without previously generated helper lines.

    Every line that starts with one of the prefixes in `remove` (by default
    the "[back to top]" links and the mk-toclify anchor tags) is dropped.
    A falsy `remove` yields an unfiltered copy of `lines`.
    """
    if remove:
        return [line for line in lines if not line.startswith(remove)]
    return lines[:]
| 60 | |
def removeToC(lines):
    """Return a copy of `lines` without an existing table of contents.

    A table of contents is recognised by a line starting with contentTitle
    at index contentLineNdx.  It consists of that title line, the following
    run of lines starting with '[' (the entries) and the single blank line
    that createToc() emits after the last entry.

    Documents that are too short to hold a table of contents, or whose line
    at contentLineNdx is not the title, are returned as an unmodified copy.
    """
    has_toc = (len(lines) > contentLineNdx
               and lines[contentLineNdx].startswith(contentTitle))
    if not has_toc:
        return lines[:]

    # Skip the entries; stop at the end of the document instead of running
    # past it when the table of contents is the last thing in the file.
    pos = contentLineNdx + 1
    while pos < len(lines) and lines[pos].startswith('['):
        pos += 1

    # Drop the separator line only if it really is blank, so that hand-edited
    # documents do not lose the first line following the table.
    if pos < len(lines) and not lines[pos].strip():
        pos += 1

    return lines[:contentLineNdx] + lines[pos:]
| 75 | |
def dashifyHeadline(line):
    """
    Split a Markdown header line into title, anchor id and level.

    Returns a 3-element list of
        the header text with the '#' markers and surrounding
        whitespace removed,
        the string used for <a id=''></a> anchor tags and ToC links,
        and the level of the headline as integer.
    E.g.,
    >>> dashifyHeadline('### Some header lvl3')
    ['Some header lvl3', 'some-header-lvl3', 3]

    """
    # the level is the number of leading '#'; trailing '#' are discarded first
    without_trailing = line.rstrip('#')
    text = without_trailing.lstrip('#')
    level = len(without_trailing) - len(text)
    title = text.strip()

    # '.' and '/' vanish, characters outside VALIDS turn into a dash
    anchor_chars = []
    for ch in title:
        if ch in './':
            continue
        anchor_chars.append(ch if ch in VALIDS else '-')
    anchor = ''.join(anchor_chars).lower()

    # collapse runs of dashes, then trim dashes from both ends
    anchor = re.sub(r'-{2,}', '-', anchor).strip('-')

    # exception '&' (double-dash in github)
    anchor = anchor.replace('-&-', '--')

    return [title, anchor, level]
| 107 | |
def tagAndCollect(lines, id_tag=True, back_links=False, exclude_h=None):
    """
    Gets headlines from the markdown document and creates anchor tags.

    Keyword arguments:
        lines: a list of strings, one per line of the Markdown document.
        id_tag: if true, inserts an <a id> tag above every collected
            headline (not required by GitHub)
        back_links: if true, adds "back to top" links below each headline
        exclude_h: header levels to exclude. E.g., [2, 3]
            excludes level 2 and 3 headings.

    A line counts as a headline when it starts with one to six '#'
    followed by a space, is indented by at most 3 spaces and contains
    something other than '#' and spaces.  Lines that do not qualify are
    copied to the output unchanged; no input line is ever dropped.

    Returns a tuple of 2 lists:
        1st list:
            A copy of the input lines with the requested anchor tags and
            back-to-top links inserted.

        2nd list:
            A list of 3-value sublists as returned by dashifyHeadline():
            the heading text, the anchor id and the headline level.
            E.g.,
            [['some header lvl3', 'some-header-lvl3', 3], ...]

    """
    heading_starts = tuple('#' * n + ' ' for n in range(1, 7))

    out_contents = []
    headlines = []
    for l in lines:
        # only spaces count as heading indentation
        l_stripped = l.lstrip(' ')
        indent = len(l) - len(l_stripped)

        is_headline = (
            l_stripped.startswith(heading_starts)
            # headers can be indented by at most 3 spaces
            and indent <= 3
            # ignore empty headers
            and bool(set(l_stripped) - {'#', ' '})
        )

        if is_headline:
            # use the unindented text so that the leading '#' are counted
            dashified = dashifyHeadline(l_stripped)

            if not exclude_h or dashified[-1] not in exclude_h:
                if id_tag:
                    out_contents.append(
                        '<a class="mk-toclify" id="%s"></a>' % (dashified[1]))
                headlines.append(dashified)

        out_contents.append(l)
        if back_links and is_headline:
            out_contents.append('[[back to top](#table-of-contents)]')
    return out_contents, headlines
| 175 | |
def positioningHeadlines(headlines):
    """
    Left-aligns the headline levels if no level 1 headline is present.

    When none of the entries has level 1, the level (last element) of every
    entry is reduced by one.  The entries are modified in place and the same
    list is returned.
    """
    has_top_level = any(entry[-1] == 1 for entry in headlines)
    if not has_top_level:
        for entry in headlines:
            entry[-1] = entry[-1] - 1
    return headlines
| 189 | |
def createToc(headlines, hyperlink=True, top_link=False, no_toc_header=False):
    """
    Builds the lines of the table of contents from the headline list
    returned by the tagAndCollect function.

    Keyword Arguments:
        headlines: list of [text, anchor id, level] lists,
            e.g., [['Some header lvl3', 'some-header-lvl3', 3], ...]
        hyperlink: if True, every entry is a Markdown link to its anchor,
            e.g., '[Some header lvl1](#some-header-lvl1)<br>';
            otherwise it is a plain '- text' item indented by its level.
        top_link: if True, an id tag for the table of contents itself is
            emitted before the title (for the back-to-top-links)
        no_toc_header: suppresses the title (and the id tag) if True.

    Returns the list of table of contents lines: the optional header, one
    '<br>'-terminated line per headline and a final '\\n' element.

    """
    toc = []

    if not no_toc_header:
        if top_link:
            toc.append('<a class="mk-toclify" id="table-of-contents"></a>\n')
        toc.append(contentTitle + '<br>')

    for entry in headlines:
        title = entry[0]
        text = ('[%s](#%s)' % (title, entry[1]) if hyperlink
                else '%s- %s' % ((entry[2] - 1) * ' ', title))
        toc.append(text + '<br>')

    toc.append('\n')
    return toc
| 223 | |
def buildMarkdown(toc_headlines, body, spacer=0, placeholder=None):
    """
    Returns a string with the Markdown output contents incl.
    the table of contents.

    Keyword arguments:
        toc_headlines: lines for the table of contents
            as created by the createToc function.
        body: contents of the Markdown file including
            ID-anchor tags as returned by the
            tagAndCollect function.
        spacer: Adds vertical space after the table
            of contents. Height in pixels.
        placeholder: If a placeholder string is provided, the placeholder
            will be replaced by the TOC instead of inserting the TOC at
            line index contentLineNdx of the document.

    When there is nothing to insert and no placeholder is given, the body
    is returned joined as-is, so that a document without a table of
    contents keeps its exact content.

    """
    toc_lines = list(toc_headlines)
    if spacer:
        toc_lines.append('\n<div style="height:%spx;"></div>\n' % (spacer))
    toc_markdown = "\n".join(toc_lines)

    if placeholder:
        return "\n".join(body).replace(placeholder, toc_markdown)

    if not toc_markdown:
        # Splitting a body shorter than contentLineNdx + 1 lines around an
        # empty table would append a newline on every run.
        return "\n".join(body)

    # The table ends with its own blank line, so the remainder of the body
    # is appended without an additional separator.
    body_markdown_p1 = "\n".join(body[:contentLineNdx]) + '\n'
    body_markdown_p2 = "\n".join(body[contentLineNdx:])
    return body_markdown_p1 + toc_markdown + body_markdown_p2
| 257 | |
def outputMarkdown(markdown_cont, output_file):
    """
    Writes `markdown_cont` to `output_file`.

    Nothing is written when `output_file` is empty or None.

    """
    if not output_file:
        return
    with open(output_file, 'w') as sink:
        sink.write(markdown_cont)
| 266 | |
def markdownToclify(
        input_file,
        output_file=None,
        min_toc_len=2,
        github=False,
        back_to_top=False,
        nolink=False,
        no_toc_header=False,
        spacer=0,
        placeholder=None,
        exclude_h=None):
    """ Function to add table of contents to markdown files.

    Any existing table of contents, back-to-top links and mk-toclify anchor
    tags are removed from the input before the new table is generated.

    Parameters
    -----------
      input_file: str
        Path to the markdown input file.

      output_file: str (default: None)
        Path to the markdown output file. Nothing is written if None.

      min_toc_len: int (default: 2)
        Minimum number of entries to create a table of contents for.

      github: bool (default: False)
        Uses GitHub TOC syntax if True.

      back_to_top: bool (default: False)
        Inserts back-to-top links below headings if True.

      nolink: bool (default: False)
        Creates the table of contents without internal links if True.

      no_toc_header: bool (default: False)
        Suppresses the Table of Contents header if True

      spacer: int (default: 0)
        Inserts vertical space (in pixels) after the table of contents.

      placeholder: str (default: None)
        Inserts the TOC at the placeholder string instead
        of inserting the TOC at the top of the document.

      exclude_h: list (default None)
        Excludes header levels, e.g., if [2, 3], ignores header
        levels 2 and 3 in the TOC.

    Returns
    -----------
    changed: Boolean
      True if the generated markdown differs from the content of
      input_file, False otherwise.

    """
    original_lines = readLines(input_file)

    cleaned_contents = removeLines(
        removeToC(original_lines),
        remove=('[[back to top]', '<a class="mk-toclify"'))

    processed_contents, raw_headlines = tagAndCollect(
        cleaned_contents,
        id_tag=not github,
        back_links=back_to_top,
        exclude_h=exclude_h)

    # add table of contents?
    if len(raw_headlines) < min_toc_len:
        processed_headlines = []
    else:
        leftjustified_headlines = positioningHeadlines(raw_headlines)

        processed_headlines = createToc(
            leftjustified_headlines,
            hyperlink=not nolink,
            top_link=not nolink and not github,
            no_toc_header=no_toc_header)

    # without links the body needs neither anchor tags nor back-to-top links
    if nolink:
        processed_contents = cleaned_contents

    cont = buildMarkdown(
        toc_headlines=processed_headlines,
        body=processed_contents,
        spacer=spacer,
        placeholder=placeholder)

    if output_file:
        outputMarkdown(cont, output_file)

    # readLines() splits on '\n', so joining restores the text that was read
    return cont != "\n".join(original_lines)
| 353 | |
def isReleaseNotes(f):
    """Return True if the file name of path `f` equals releaseNotesName."""
    _, filename = os.path.split(f)
    return filename == releaseNotesName
| 356 | |
def excludeHeadingsFor(f):
    """Return the heading levels to exclude from the ToC of document `f`."""
    if isReleaseNotes(f):
        return headingExcludeRelease
    return headingExcludeDefault
| 359 | |
def updateSingleDocumentToC(input_file, min_toc_len, verbose=False):
    """Add or update table of contents in specified file. Return 1 if file changed, 0 otherwise.

    The new content is first written to '<input_file>.tmp'.  The temporary
    file replaces the document only if its content differs; otherwise it is
    deleted and the document is left untouched.
    """
    import filecmp  # only needed here; keeps the block self-contained

    if verbose:
        print('file: {}'.format(input_file))

    output_file = input_file + '.tmp'

    markdownToclify(
        input_file=input_file,
        output_file=output_file,
        min_toc_len=min_toc_len,
        github=True,
        back_to_top=False,
        nolink=False,
        no_toc_header=False,
        spacer=False,
        placeholder=False,
        exclude_h=excludeHeadingsFor(input_file))

    # shallow=False forces a comparison of the file contents
    if filecmp.cmp(input_file, output_file, shallow=False):
        os.remove(output_file)
        return 0

    # prevent race-condition (Python 3.3):
    if sys.version_info >= (3, 3):
        os.replace(output_file, input_file)
    else:
        os.remove(input_file)
        os.rename(output_file, input_file)

    return 1
| 387 | |
def updateDocumentToC(paths, min_toc_len, verbose):
    """Add or update table of contents to specified paths. Return number of changed files

    Every entry of `paths` is expanded as a glob pattern; matches that are
    not regular files are skipped.
    """
    return sum(
        updateSingleDocumentToC(
            input_file=candidate, min_toc_len=min_toc_len, verbose=verbose)
        for pattern in paths
        for candidate in glob.glob(pattern)
        if os.path.isfile(candidate))
| 396 | |
def updateDocumentToCMain():
    """Add or update table of contents to specified paths.

    Parses the command line, processes the given files (or documentsDefault
    if none are given) and prints a summary of the number of changed files.
    """

    parser = argparse.ArgumentParser(
        description='Add or update table of contents in markdown documents.',
        epilog='',
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument(
        'Input',
        metavar='file',
        type=str,
        nargs=argparse.REMAINDER,
        help='files to process, at default: docs/*.md')

    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='report the name of the file being processed')

    parser.add_argument(
        '--min-toc-entries',
        dest='minTocEntries',
        default=minTocEntries,
        type=int,
        metavar='N',
        help='the minimum number of entries to create a table of contents for [{deflt}]'.format(deflt=minTocEntries))

    # Removal works by demanding more entries than any document can have;
    # sys.maxsize guarantees that, unlike a fixed small threshold.
    parser.add_argument(
        '--remove-toc',
        action='store_const',
        dest='minTocEntries',
        const=sys.maxsize,
        help='remove all tables of contents')

    args = parser.parse_args()

    paths = args.Input if len(args.Input) > 0 else [documentsDefault]

    changedFiles = updateDocumentToC(paths=paths, min_toc_len=args.minTocEntries, verbose=args.verbose)

    if changedFiles > 0:
        print("Processed table of contents in " + str(changedFiles) + " file(s)")
    else:
        print("No table of contents added or updated")
| 442 | |
# Run the command line interface only when executed as a script, not on import.
if __name__ == '__main__':
    updateDocumentToCMain()
| 445 | |
| 446 | # end of file |