XML to tab

Print as transpose if data is too wide to view from left to right.

Python, regex experiment.

# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import re
# Usage example:
#   xml2Tab_t.py drill4Opseq.xml 20 ProductData Machine MachiningData SequenceOperation > hasil.txt
#
# Notes:
#   NOf = NumberOf
#   sys.argv layout: [0] script name, [1] XML file, [2] max number of columns,
#   [3] and onwards: wanted tag names

# Number of leading sys.argv entries (script name, XML file, column limit)
# that come before the first wanted tag name.
minNOfArgv = 3

# Column limit used when the command line does not provide a usable one;
# 20 is the value shown in the usage example above.
_defaultMaxNOfColumn = 20

try:
    maxNOfColumn = int(sys.argv[minNOfArgv - 1])
except IndexError:
    # Too few arguments: stay importable so that main() can perform its own
    # argument-count check instead of crashing here at import time.
    maxNOfColumn = _defaultMaxNOfColumn
except ValueError:
    sys.stderr.write(
        "warning: column limit %r is not an integer, using %d\n"
        % (sys.argv[minNOfArgv - 1], _defaultMaxNOfColumn)
    )
    maxNOfColumn = _defaultMaxNOfColumn

# --------------------------------

def printAs(type, line):
    """Print the titles or the data values found in one line of text.

    type -- "title" prints every substring that sits between a whitespace
            character and "="; "data" prints every substring enclosed in
            double quotes.  Any other value prints nothing.
    line -- the text line to scan.

    The parameter name shadows the builtin ``type``; it is kept so that the
    signature stays unchanged for existing callers.
    """
    if type == "title":
        # Raw string: "\s" in a normal literal is an invalid escape sequence
        # that newer Python versions warn about.
        doPrint(r"\s", "=", line)
    elif type == "data":
        doPrint('"', '"', line)

def storeAs(dType, tType, line, array):
    """Append the titles or data values found in ``line`` to ``array``.

    dType -- "title": collect substrings between a whitespace character and
             "=" (leading whitespace of the line is removed first);
             "data": collect substrings enclosed in double quotes.
    tType -- "normal": titles are gathered in ``array[0]`` and every data
             line that has at least one value becomes a new row;
             "transpose": every title opens its own row and the n-th data
             value of a line is appended to ``array[n]``;
             any other value: the layout is picked from the number of
             matches in ``line`` -- fewer than the module-level
             ``maxNOfColumn`` gives "normal", otherwise "transpose".  A line
             without matches stores nothing.
    line  -- the text line to scan.
    array -- list of lists, modified in place.

    Raises ValueError when ``dType`` is neither "title" nor "data".
    """
    if dType == "title":
        line = re.sub(r"^\s+", "", line)
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        beg, en = r"\s", "="
    elif dType == "data":
        beg, en = '"', '"'
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``beg``.
        raise ValueError("dType must be 'title' or 'data', got %r" % (dType,))

    reg = beg + r"(.*?)" + en
    # The pattern has a single group, so findall returns exactly the captured
    # substrings; scanning once serves both the count and the values.
    values = re.findall(reg, line, re.S)

    if tType != "transpose" and tType != "normal":
        if len(values) >= maxNOfColumn:
            tType = "transpose"
        elif values:
            tType = "normal"

    if tType == "transpose":
        for i, value in enumerate(values):
            if dType == "title":
                array.append([])  # one row per title
            array[i].append(value)
    elif tType == "normal" and values:
        if dType == "title":
            if len(array) == 0:
                array.append([])  # row 0 holds the titles
            array[0].extend(values)
        else:
            # All values of one data line form one new row.
            array.append(list(values))
def doPrint(beg, en, line):
    """Print every substring of ``line`` found between ``beg`` and ``en``.

    beg  -- regular-expression fragment that must precede a substring.
    en   -- regular-expression fragment that must follow a substring.
    line -- the text line to scan.

    Each substring is followed by a tab and the row is ended with a newline.
    Nothing at all is printed when the line contains no match.
    """
    reg = beg + r"(.*?)" + en

    # Remove whitespace at the beginning of the line, especially for titles.
    line = re.sub(r"^\s+", "", line)

    # Test for a match first so that the newline can be printed outside the
    # for loop, and only when something was actually printed.
    if re.search(reg, line):
        # Print the substring between beg and en.
        for match in re.finditer(reg, line, re.S):
            print(match.group(1), end="\t")
        print("")

def printArray(type, array):
    """Print a list of lists as tab-separated rows, then one blank line.

    type  -- unused; kept so that existing calls keep working (the name
             shadows the builtin ``type``).
    array -- iterable of rows; each cell is converted with ``str`` and
             followed by a tab, each row is ended with a newline.
    """
    # Iterating directly avoids ``xrange``, which does not exist on Python 3.
    for row in array:
        for cell in row:
            print(str(cell), end="\t")
        print("")
    print("")

def main():
    """Scan the XML file named in sys.argv[1] and print the wanted tags.

    The wanted tag names are the arguments from sys.argv[minNOfArgv] onwards.
    Returns without output when no tag name was given.

    A line that matches "<" + tag name starts a section (line counter 1); a
    line that matches "</" + tag name ends it.  Inside a "ProductData"
    section lines are printed right away: the line with counter 2 prints a
    header, titles and data, the line with counter 3 prints titles and data,
    every other line prints data only.  Inside any other section the line
    with counter 2 resets the buffer and stores titles and data, other lines
    store data; the buffer is printed under a "<tag>" header when the closing
    tag is met.
    """
    iLine = 0              # index line
    cArg = 0               # current (observed) argument
    insideTag = False
    aData = []

    if len(sys.argv) < minNOfArgv + 1:
        # note: sys.argv[0] is the script name
        return

    wantedTags = sys.argv[minNOfArgv:]

    # The file is opened only after the argument check has passed; the open
    # call used to sit behind the ``return`` above and was never reached.
    with open(sys.argv[1]) as fo:
        for sLine in fo:

            # check boundary (opening or closing tag of a wanted element)
            for a, tag in enumerate(wantedTags):
                if re.search(r"\<" + tag, sLine):
                    insideTag = True
                    cArg = a
                    iLine = 1
                if re.search(r"\<\/" + tag, sLine):
                    insideTag = False
                    if wantedTags[cArg] == "ProductData":
                        print("")
                    else:
                        print("<" + wantedTags[cArg] + ">")
                        printArray("", aData)

            # get data
            if insideTag:
                if wantedTags[cArg] == "ProductData":
                    if iLine == 2:
                        print("<" + wantedTags[cArg] + ">")
                        printAs("title", sLine)
                        printAs("data", sLine)
                        print("")
                    elif iLine == 3:
                        printAs("title", sLine)
                        printAs("data", sLine)
                    else:
                        printAs("data", sLine)
                else:
                    if iLine == 2:
                        aData[:] = []
                        storeAs("title", "", sLine, aData)
                        storeAs("data", "", sLine, aData)
                    else:
                        storeAs("data", "", sLine, aData)

                iLine = iLine + 1

if __name__ == "__main__":
    main()

Posting code to wordpress

Python, regex

  1. Add <pre> in the beginning of the code
  2. Add </pre> at the very end of the code
  3. If indentation is needed, use &nbsp;
    In this case, every run of 4 spaces is converted into 4 &nbsp;
    (because a double tab doesn’t seem to work)

.bat (double click)

main.py wantedCode > result.txt

main.py

# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import re

def main():
    """Print the file named in sys.argv[1] wrapped in <pre> ... </pre>.

    Every run of four spaces or tabs is replaced by four "&nbsp;" entities
    so that the indentation survives when the result is pasted into a post.
    """
    # Emit the opening tag before reading, so that an empty input file still
    # yields a balanced <pre></pre> pair.
    print("<pre>", end="")
    with open(sys.argv[1]) as fh:
        for line in fh:
            # The accompanying notes say that 4 spaces become 4 &nbsp;; the
            # entities had been rendered as plain spaces in the listing.
            # The class excludes line terminators so that a line ending in
            # spaces keeps its newline.
            line = re.sub(r"[ \t]{4}", "&nbsp;&nbsp;&nbsp;&nbsp;", line)
            print(line, end="")

    print("</pre>", end="")

if __name__ == "__main__":
    main()

Additional:

HTML Seen
print(“&lt;pre&gt;”, end=””) print(“<pre>”, end=””)
print(“&lt;/pre&gt;”, end=””) print(“</pre>”, end=””)

Print inside tag XML

Regex, python

1. interface .bat (double click), e.g., execute.bat

main.py drill4Opseq.xml tag1 tag2 > hasil.txt

2. python script, e.g., main.py

# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import re

def printData(beg, en, line):
    """Write the pieces of ``line`` enclosed by ``beg`` and ``en`` as one row.

    beg  -- regular-expression fragment in front of each piece.
    en   -- regular-expression fragment behind each piece.
    line -- the text to scan.

    Every piece is followed by a tab; the row is closed with a newline.
    A line without any piece produces no output.
    """
    pattern = beg + r"(.*?)" + en

    # Leading whitespace is dropped first, mainly for the benefit of titles.
    stripped = re.sub(r"^\s+", "", line)

    if not re.search(pattern, stripped):
        return

    # Build the whole row first, then emit it with a single print call.
    cells = [found.group(1) + "\t"
             for found in re.finditer(pattern, stripped, re.S)]
    print("".join(cells))

def main():
    # NOTE(review): this listing is incomplete as published. The statement
    # that starts with "if len(sys.argv)" breaks off mid-line and everything
    # up to the first printData call is missing -- presumably the argument
    # check, the open() that defines ``fh``, and the loop over its lines;
    # TODO restore from the original script before running.
    i = 0
    arg = 0
    insideTag = False
    #app = QtGui.QApplication(sys.argv)
    # NOTE(review): the next line is truncated; the code is left untouched.
    if len(sys.argv)")
                printData("\s", "=", line)
            printData("\"", "\"",  line)
            i=i+1
    fh.close()    

if __name__ == "__main__":
    main()

ps: pakai &nbsp; buat indent 😀

Regex with Python

Expression:

  • between double quote
  • to mark regex as raw string, use r in front of expression, such as
    r"^a.*\.html$"

    this way, no need to use \\

Contents:

  1. Frequently used command
  2. Metacharacters
  3. Example inside character class []
  4. General example
  5. Special sequences

1. FREQUENTLY USED COMMAND:

No Command Title
1 re.search(expression, substring) match expression in a substring
re.search("cat", "a cat and a rat can't be friends")
2 re.match(expression, substring) match ONLY in the BEGINNING of the string. This command is only in python (can’t be used in perl)
print re.match(r"M[ae][iy]er", "He is called Meyer")
returns None
3 re.MULTILINE

or

re.M

match also after newline
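s1 = "Mayer is a very common name"
s2 = "He is called Meyer"
s = s2 + "\n" + s1
print re.search(r"^M[ae][iy]er", s, re.MULTILINE)
returns a match object, because with re.MULTILINE the ^ also matches right after the newline (where "Mayer" starts); without re.MULTILINE the same search returns None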

2. METACHARACTERS

A metacharacter = a character that has special meaning

Metacharacter Meaning
[] Character class, means either, if any
^ Caret, means negation (but/except), only if it is used directly after [
. any single character except a newline (the wildcard; usually combined with *, as in .*)
* asterisk or star,
means that this subexpression may be repeated arbitrarily (including 0 times)
^ matches the start of the string
re.search(r"^M[ae][iy]er", "Mayer is a very common name")
$ matches the end of the string
(or just before newline if its in MULTILINE mode)
e.g., re.search(r"Python\.$","I like Python and Perl.")

* usually, metacharacters should be preceded by a backslash in order to drop their special meaning, and be treated literally inside an expression

3. EXAMPLE INSIDE CHARACTER CLASS []

Expression Meaning
[ae] either a or e
r”M[ae][iy]er” either Maier, Mayer, Meier, or Meyer
[a-e] [abcde]
[a-z] [abc … z]
[-a-z] [-abc … z], i.e., a literal - or any letter between a and z
[0-5] [012345]
[A-Z] any uppercase letter
[^abc] anything but a, b, or c
[a^bc] a, b, c, or ^
.at any 3 letter words, ended by “at”

4. GENERAL EXAMPLE

Expression Meaning
r”^a.*\.html$” substring started with an a and end with .html
r”M[ae][iy]er” either:
Maier,
Mayer,
Meier, or
Meyer
r”[a-zA-Z0-9_]” all lowercase characters
+ uppercase characters
+ all digits
+ underscore
[a-z] [abc … z]
[-a-z] [-abc … z], i.e., a literal - or any letter between a and z
[0-5] [012345]
[A-Z] any uppercase letter
[^abc] anything but a, b, or c
[a^bc] a, b, c, or ^
.at any 3 letter words, ended by “at”
r”M[ae][iy]e?r” either:
Maier, Mair,
Mayer, Mayr,
Meier, Meir
Meyer, Meyr
r”[0-9]*” any sequence of digits, even if its empty string

5. SPECIAL SEQUENCES

Expression Meaning
\d [0-9], any decimal digit
\D complement of \d
i.e., [^0-9], any non-digit characters
\s [ \t\n\r\f\v], whitespace characters (including the space itself)
\S complement of \s
i.e., [^ \t\n\r\f\v]
\w [a-zA-Z0-9_], any alphanumeric characters
\W complement of \w (symbols?)
\b empty string, but only at the start or end of a word
\B empty string, but NOT at the start or end of a word
\\ literal backslash
.at any 3 letter words, ended by “at”

 

Contoh code (for personal use) [ps: how to add indent? #pythonProblem]

# -*- coding: utf-8 -*-
from __future__ import print_function
from PySide import QtCore, QtGui
import sys
import re 

def main():
    """Print attribute names and values found in "analyzeMe.txt".

    For the first line of the file the attribute names (text between a
    whitespace character and "=") are printed as one tab-separated row.
    For every line the double-quoted values are printed as one
    tab-separated row.

    NOTE(review): the published listing had lost its indentation (the
    ``if`` and the second ``for`` had no body).  The nesting below is a
    reconstruction -- confirm against the original script.
    """
    i = 1
    # Kept from the listing; no Qt event loop is started (see the
    # commented-out call at the end).
    app = QtGui.QApplication(sys.argv)
    with open("analyzeMe.txt") as fh:
        #print str(i)
        for line in fh:
            if i == 1:
                # Title row: strip the indentation and the "<MD" tag name,
                # then list the attribute names.
                line = re.sub(r"^\s+", "", line)
                line = re.sub(r"\<MD", "", line)

                for match in re.finditer(r'\s(.*?)=', line, re.S):
                    print(match.group(1), end="\t")
                print("")

            # Data row: every double-quoted value of the line.
            for match in re.finditer(r'"(.*?)"', line, re.S):
                print(match.group(1), end="\t")
            i = i + 1
            print("")
    #sys.exit(app.exec_())

if __name__ == "__main__":
 main()

References

  1. http://www.python-course.eu/re.php
  2. https://docs.python.org/2/library/re.html
  3. https://en.wikibooks.org/wiki/Python_Programming/Regular_Expression

Kapan2 dibaca

  1. http://stackoverflow.com/questions/171480/regex-grabbing-values-between-quotation-marks
  2. http://stackoverflow.com/questions/9085558/python-regex-match-text-between-quotes