parsing lists not working

juangpc · juangpc · commit bcd13bdf6a4e · 2021-05-26T18:24:00.000-04:00
diff --git a/tools/python/demo_recursive_folder_process.py b/tools/python/demo_recursive_folder_process.py
@@ -1,18 +1,18 @@
-import mne_cpp.core
+import mne_cpp.core as mne
 # import mne_cpp.pdf_doc
 from os import stat
 
-mne_cpp.core.version()
+mne.version()
 
-projectFolder = mne_cpp.core.baseFolder()
+projectFolder = mne.baseFolder()
 
 #  Recursively list all the files in a directory and order by size and print results.
 listOfFiles = []
-mne_cpp.core.recursiveFolderProcess(projectFolder + 'doc/gh-pages', lambda f:                     \
-                                                                        listOfFiles.append((f, stat(f).st_size))    \
-                                                                        if f.name.endswith('.md')  \
-                                                                        else None )
+mne.recursiveFolderProcess(projectFolder + 'doc/gh-pages', lambda f:                     \
+                                                            listOfFiles.append((f, stat(f).st_size))    \
+                                                            if f.name.endswith('.md')  \
+                                                            else None )
 listOfFiles.sort(reverse=True, key=lambda f:f[1])
 for f in listOfFiles:
-    print('File: ' + f[0].path + ' - (' + mne_cpp.core.sizeHumanReadable(f[1]) + ')')
+    print('File: ' + f[0].path + ' - (' + mne.sizeHumanReadable(f[1]) + ')')
 
diff --git a/tools/python/documentation_pdf_generator.py b/tools/python/documentation_pdf_generator.py
@@ -1,25 +1,93 @@
 import mne_cpp.core
-import mne_cpp.pdf_doc
+import mne_cpp.pdf_doc as mnepdf
 
 projectFolder = mne_cpp.core.baseFolder()
 webBaseFolder = projectFolder + 'doc/gh-pages'
 
-# webDocuments = mne_cpp.pdf_doc.scanFolder(webBaseFolder)
+# webDocuments = mnepdf.scanFolder(webBaseFolder)
 # print(webDocuments)
 
-# web = mne_cpp.pdf_doc.buildWebStructure(webDocuments)
+# web = mnepdf.buildWebStructure(webDocuments)
 # print('Printing Web Structure:')
 # print(web)
 
 # (pathLabel, filePath, fileName, fileExt, fullPath) = mne_cpp.core.extractFilePaths('../../doc/gh-pages/pages/documentation/anonymize.md')
-(pathLabel, filePath, fileName, fileExt, fullPath) = mne_cpp.core.extractFilePaths('../../doc/gh-pages/pages/contact.md')
+# (pathLabel, filePath, fileName, fileExt, fullPath) = mne_cpp.core.extractFilePaths('../../doc/gh-pages/pages/contact.md')
+
+# inFile = open(fullPath, mode = 'r', encoding = 'utf8')
+# inText = inFile.read()
+# inFile.close()
+
+# outText = mnepdf.parseUnorderedList(inText)
+# outFile = open(pathLabel + filePath + fileName + '.PROCESSED' + '.' + fileExt, mode = 'w', encoding = 'utf8')
+# outFile.write(outText)
+# outFile.close()
+
+inText = r'''
+---
+layout: default
+title: Markdown kitchen sink
+nav_order: 99
+---
+(\n(( *[-*] *)|(\s*\d+\.\s*))[^\-*\n ].+)+
+(\n(( *[-*] *)|( *\d+\. *))[^\-*\n ].+)+
+(\n(( *[-*] *)|( *\d+\. *))[^\-*\n ].+)+
+Text can be **bold**, _italic_, or ~~strikethrough~~.
+
+[Link to another page](another-page).
+
+There should be whitespace between paragraphs.
+
+There should be whitespace between paragraphs. We recommend including a README, or a file with information about your project.
+
+# [](#header-1)Header 1
+
+This is a normal paragraph following a header. GitHub is a code hosting platform for version control and collaboration. It lets you and others work together on projects from anywhere.
+
+## [](#header-2)Header 2
+
+> This is a blockquote following a header.
+>
+> When something is important enough, you do it even if the odds are not in your favor.
+
+### [](#header-3)Header 3
+
+```js
+// Javascript code with syntax highlighting.
+var fun = function lang(l) {
+  dateformat.i18n = require('./lang/' + l)
+  return true;
+}
+```
+
+```ruby
+# Ruby code with syntax highlighting
+GitHubPages::Dependencies.gems.each do |gem, version|
+  s.add_dependency(gem, "= #{version}")
+end
+```
+- level 1 item
+  - level 2 item
+  - level 2 item
+    - level 3 item
+    - level 3 item
+- level 1 item
+  - level 2 item
+  - level 2 item
+  - level 2 item
+- level 1 item
+  - level 2 item
+  - level 2 item
+- level 1 item
+'''
+
+outText = mnepdf.parseLists(inText)
+
+
+
+
+
+a = 3
 
-inFile = open(fullPath, mode = 'r', encoding = 'utf8')
-inText = inFile.read()
-inFile.close()
 
-outText = mne_cpp.pdf_doc.parseUnorderedList(inText)
-outFile = open(pathLabel + filePath + fileName + '.PROCESSED' + '.' + fileExt, mode = 'w', encoding = 'utf8')
-outFile.write(outText)
-outFile.close()
 
diff --git a/tools/python/list_trash_text1.txt b/tools/python/list_trash_text1.txt
@@ -0,0 +1,15 @@
+\begin{itemize}
+	\item level 1 item
+  - level 2 item
+  - level 2 item
+    - level 3 item
+    - level 3 item
+	\item level 1 item
+  - level 2 item
+  - level 2 item
+  - level 2 item
+	\item level 1 item
+  - level 2 item
+  - level 2 item
+	\item level 1 item
+\end{itemize}
diff --git a/tools/python/list_trash_text2.txt b/tools/python/list_trash_text2.txt
@@ -0,0 +1,26 @@
+\begin{itemize}
+	\item level 1 item
+	\begin{itemize}
+		\item level 2 item
+		\item level 2 item
+		\begin{itemize}
+			\item level 3 item
+			\item level 3 item
+		\end{itemize}
+	\end{itemize}
+	\item level 1 item
+	\begin{itemize}
+		\item level 2 item
+		\item level 2 item
+		\item level 2 item
+	\end{itemize}
+	\item level 1 item
+	\begin{itemize}
+		\item level 2 item
+		\item level 2 item
+	\end{itemize}
+	\item level 1 item
+\end{itemize}
+ 
+ 
+ 
diff --git a/tools/python/mne_cpp/__pycache__/core.cpython-39.pyc b/tools/python/mne_cpp/__pycache__/core.cpython-39.pyc
diff --git a/tools/python/mne_cpp/__pycache__/pdf_doc.cpython-39.pyc b/tools/python/mne_cpp/__pycache__/pdf_doc.cpython-39.pyc
diff --git a/tools/python/mne_cpp/core.py b/tools/python/mne_cpp/core.py
@@ -158,8 +158,8 @@ def parseInputArguments(argsToParse, **opts):
             options[arg_adapted] = argsToParse[arg]
     return (v for k, v in options.items())
 
-_suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
 def sizeHumanReadable(size):
+    _suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
     # determine binary order in steps of size 10 
     # (coerce to int, // still returns a float)
     order = int(log2(size) / 10) if size else 0
diff --git a/tools/python/mne_cpp/pdf_doc.py b/tools/python/mne_cpp/pdf_doc.py
@@ -190,8 +190,14 @@ def parseMarkDownFile(file, **inputArgs):
     else:
         with open(file.fullPath, 'r', encoding='utf8') as markDownFile, \
              open(texFile,'a+') as texFile:
+            # The order here matters: some of the regexes depend on conflicting patterns not being present.
+            # e.g. 1: empty lines can sometimes interfere with some of the list patterns.
+            # e.g. 2: the horizontal-line pattern (\n* * *) can sometimes be mistaken for a list.
+            # I've tried to minimize these conflicts, but I'm not 100% sure, so any change should be tested...
             inText = markDownFile.read()
+            inText = stripEmptyLines(inText)
             inText = deleteJustTheDocsHeader(inText)
+            inText = parseHorizontalLines(inText)
             inText = parseInlineItalicText(inText)
             inText = parseInlineBoldText(inText)
             inText = parseUnorderedList(inText)
@@ -211,25 +217,11 @@ def parseInlineItalicText(inText):
 def parseInlineBoldText(inText):
     return re.sub(r'(?<=\W)((?P<dstar>\*\*)|__)(?P<btext>[\w ]+)((?(dstar)\*\*)|__)(?=\W)',r'\\textbf{\g<btext>}', inText)
 
-def parseUnorderedList(inText):
-    match = re.search(r'(\n\s?\*\s?.+)(\n\s?\*\s?(.+))*', inText)
-    if match:
-        outList = '\n\\begin{itemize}\n'
-        pattern2 = re.compile(r'\n*\s*\*\s*(?P<item>.+)(?=\n)?')
-        itemList = pattern2.finditer(match.group(0))
-        for item in itemList:
-            outList += '\t\\item ' + item.group('item') + '\n'
-        outList += '\\end{itemize}'
-        outText = inText[:match.start(0)] + outList + inText[match.end(0):]
-        return parseUnorderedList(outText)
-    else:
-        return inText
-
 def parseInlineImages(inText):
     match = re.search(r'!\[(?P<alt_text>[^]]+)\]\((?P<imgFilePath>[^)]+)\)', inText)
     if match:
-        imgPath = mne_cpp.core.none_if_empty(match.group('imgFilePath'))
-        imgAltText = mne_cpp.core.none_if_empty(match.group('alt_text'))
+        imgPath = mne_cpp.core.noneIfEmpty(match.group('imgFilePath'))
+        imgAltText = mne_cpp.core.noneIfEmpty(match.group('alt_text'))
         figText  = '\n\\begin{wrapfigure}{r}{0.5\\textwidth}'
         figText += '\n\t\\begin{center}'
         figText += '\n\t\t\\includegraphics[width=0.4\\textwidth]{ ' + imgPath + '}'
@@ -244,7 +236,7 @@ def parseInlineImages(inText):
 def parseInlineHTMLImages(inText):
     match = re.search(r'<\s*img\s*src\s*=\s*"(?P<imgPath>[^"]+)".*>', inText)
     if match:
-        imgPath = mne_cpp.core.none_if_empty(match.group('imgFilePath'))
+        imgPath = mne_cpp.core.noneIfEmpty(match.group('imgFilePath'))
         figText  = '\n\\begin{wrapfigure}{r}{0.5\\textwidth}'
         figText += '\n\t\\begin{center}'
         figText += '\n\t\t\\includegraphics[width=0.4\\textwidth]{ ' + imgPath + '}'
@@ -256,7 +248,7 @@ def parseInlineHTMLImages(inText):
     else:
         return inText
 
-def parseTableMd(inText)
+def parseTableMd(inText):
     match = re.search(r'(?<=\n)\|([^|\n]+\|)+', inText)
     if match:
         tableText = inText[match.start(0):match.end(0)]
@@ -312,14 +304,67 @@ def parseHeaders(inText):
     else:
         return inText
 
-def parseHorizontalLine(inText):
+def parseHorizontalLines(inText):  # swaps '* * *' rule lines for a LaTeX rule. NOTE(review): re.sub processes backslash escapes in a string repl, so the '\n' and '\r' that begin '\noindent' and '\rule' likely come out as newline/CR - confirm
     return re.sub(r'(?<=\n)\*\s\*\s\*(?=\n)','\\noindent\\rule{15cm}{0.5pt}', inText)
 
-# parse horizontal line 
-# \n\* \* \*
+def stripHorizontalLines(inText):
+    return re.sub(r'(?<=\n)\*\s\*\s\*(?=\n)','', inText)
+
+def stripEmptyLines(inText):
+    return re.sub(r'((?<=\n)\n)','',inText)
+
+# def parseUnorderedList(inText):
+#     match = re.search(r'(\n\s?\*\s?.+)(\n\s?\*\s?(.+))*', inText)
+#     if match:
+#         outList = '\n\\begin{itemize}\n'
+#         pattern2 = re.compile(r'\n*\s*\*\s*(?P<item>.+)(?=\n)?')
+#         itemList = pattern2.finditer(match.group(0))
+#         for item in itemList:
+#             outList += '\t\\item ' + item.group('item') + '\n'
+#         outList += '\\end{itemize}'
+#         outText = inText[:match.start(0)] + outList + inText[match.end(0):]
+#         return parseUnorderedList(outText)
+#     else:
+#         return inText
+def parseUnorderedList(inText, i):
+    pattern = r'\n(( {0}[-*] *)(?P<itemText>.*))'
+    lastMatch = len(re.findall(pattern, inListText))
+    matches = re.finditer(pattern, inListText)
+    parsedText = ''
+    for numMatch, match in enumerate(matches, start = 1):
+        itemText = '\n\\begin{itemize}' if numMatch is 1 else ''
+        itemText += '\\item ' + match.group('itemText')
+        itemText += '\\end{itemize}' if numMatch is lastMatch
+        parsedText += inListText[:match.start()] + itemText + inListText[match.end():]
+
+
+def parseOneList(inList):
+    outList = parseUnorderedList(inList)
+
+
+
+def parseLists(inText):
+    match = re.search(r'(\n(( *[-*] *)|( *\d+\. *))[^\-*\n ].+)+', inText)
+    if match:
+        parsedList = parseOneList(match.group())
+        outText = inText[:match.start()] + parsedList + inText[match.end():]
+        return parseLists
+    else: 
+        return inText            
+
+
+    # for spaces in range(2:2:6):
+    #     pattern = 
+
+#         matches4ord = re.finditer(r'(\n( {2}(\d+\.) *)([^-\n ].*))+', text[match.start(0):match.end(0)])
+#         for match4ord in matches4ord:
+#             outText = '\n\\begin{enumerate}\n'
+
+# ((\n {2}\d+\. *)(?P<item>.*))
 
 # parse all lists with (\n((\s*[-*]\s*)|(\s*\d+\.\s*)).+)+
-# see https://regex101.com/r/2uKqPB/1/
+# https://regex101.com/r/idzIo5/1/
+# https://regex101.com/r/Iu3hKt/1
 
 # after this parse 
 # ordered lists of level 4
@@ -335,18 +380,16 @@ def parseHorizontalLine(inText):
 # https://tex.stackexchange.com/questions/247681/how-to-create-checkbox-todo-list
 
 
-
-
 # still missing: 
 # ordered and unordered lists parsing
 # inbound links vs outbound links
 # parse inline code
 # preamble and ending file
 # parse multiple terms description/definition
-
+# header tags up to 6 #s
 
 def processImage(imageFile):
-        _, _, _, _, fileExt = mne_cpp.core.parseFilePathNameExt(imageFile)
+    _, _, _, _, fileExt = mne_cpp.core.parseFilePathNameExt(imageFile)
     if fileExt == "jpg" or fileExt == "jpeg":
         jpg2png(imageFile)
     if fileExt == "svg2":