Skip to content
This repository was archived by the owner on Feb 5, 2021. It is now read-only.

Commit e65a16f

Browse files
authored
Release 1.5.1
1 parent fb197c1 commit e65a16f

File tree

1 file changed

+82
-23
lines changed

1 file changed

+82
-23
lines changed

tex2txt.py

+82-23
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@
2323
#
2424
# Principle of operation:
2525
# - read complete input text into a string, then make replacements
26-
# - replacements are performed via the wrapper mysub() in order
27-
# to observe deletion and inclusion of line breaks
26+
# - replacements are performed via the "re-implementation" mysub() of
27+
# re.sub() in order to observe deletion and inclusion of line breaks
2828
# - in order to treat nested braces / brackets and some nested
2929
# environments, we construct regular expressions by iteration;
3030
# maximum recognized nesting depth (and thus length of these expressions)
@@ -63,8 +63,8 @@ class Aux: pass
6363
# repl:
6464
# - replacement pattern, r'\d' (d: single digit) extracts text
6565
# from position d in args (counting from 1)
66-
# - escape rules: see replacement argument of re.sub();
67-
# include single backslash: repl=r'...\\...'
66+
# - other escape rules: see escape handling at myexpand() below;
67+
# e.g., include a single backslash: repl=r'...\\...'
6868
# - a literal % is accepted only in its escaped form r'\\%'; it will be
6969
# resolved to % at the end by resolve_escapes()
7070
# - inclusion of double backslash \\ and replacement ending with \
@@ -641,7 +641,7 @@ def verbatim(s, mark, ast):
641641

642642
#######################################################################
643643
#
644-
# This wrapper for re.sub() operates a small machinery for
644+
# This "re-implementation" of re.sub() operates a small machinery for
645645
# line number tracking.
646646
# Argument text is a 2-tuple.
647647
# text[0]: the text as string
@@ -668,14 +668,15 @@ def mysub(expr, repl, text, flags=0, extract=None):
668668
if not t:
669669
continue
670670
if type(repl) is str:
671-
r = myexpand(m, repl, text)
671+
ex = myexpand(m, repl, text)
672672
else:
673-
r = repl(m)
674-
if type(r) is tuple:
673+
ex = repl(m)
674+
if type(ex) is tuple:
675675
# replacement contains line number information
676-
nums2 = r[1]
677-
r = r[0]
676+
r = ex[0]
677+
nums2 = ex[1]
678678
else:
679+
r = ex
679680
nums2 = None
680681
res += txt[last:m.start(0)]
681682
last = m.end(0)
@@ -714,8 +715,8 @@ def text_combine(text1, text2):
714715
+ r'|' + re_end_env + r'|\s)*\Z')
715716
(t1, n1) = text1
716717
(t2, n2) = text2
717-
i = t1.rfind('\n') + 1 # i == 0, if not found
718-
if re.search(space, t1[i:]):
718+
if n1[-1] == n2[0] or re.search(space, t1[t1.rfind('\n')+1:]):
719+
# same line numbers at junction or
719720
# only "space" after last line break in text1:
720721
# use first line number from text2 at junction
721722
n = n1[:-1] + n2
@@ -729,9 +730,8 @@ def text_combine(text1, text2):
729730
def text_add_frame(pre, post, text):
730731
return (
731732
pre + text[0] + post,
732-
(-abs(text[1][0]),) * pre.count('\n')
733-
+ text[1]
734-
+ (-abs(text[1][-1]),) * post.count('\n')
733+
(text[1][0],) * pre.count('\n') + text[1]
734+
+ (text[1][-1],) * post.count('\n')
735735
)
736736

737737
# extract text with line number information from a group of a match
@@ -743,11 +743,69 @@ def text_from_match(m ,grp, text):
743743
end = beg + m.group(grp).count('\n') + 1
744744
return (m.group(grp), text[1][beg:end])
745745

746-
# here, we could re-implement parsing of the repl string and provide
747-
# line number information, if a used capturing group spans multiple lines
746+
# expansion of a match from replacement template repl:
747+
# returned text element provides line number information,
748+
# if repl contains a reference to a capturing group
748749
#
749750
def myexpand(m, repl, text):
750-
return m.expand(repl)
751+
# return m.expand(repl) # fail-safe version
752+
if not repl:
753+
return ''
754+
755+
# first parse repl: build list 'ops' of
756+
# (strings) and (numbers of referenced capturing groups)
757+
# - compare parse_template() in /usr/lib/python?.?/sre_parse.py
758+
escapes = {
759+
'a': '\a', 'b': '\b', 'f': '\f', 'n': '\n',
760+
'r': '\r', 't': '\t', 'v': '\v', '\\': '\\'
761+
}
762+
ops = []
763+
first = None
764+
cur_str = ''
765+
i = 0
766+
while i < len(repl):
767+
c = repl[i]
768+
i += 1
769+
if c != '\\':
770+
cur_str += c
771+
continue
772+
if i >= len(repl):
773+
cur_str += '\\'
774+
break
775+
c = repl[i]
776+
i += 1
777+
if c in escapes:
778+
cur_str += escapes[c]
779+
elif c in '0g':
780+
fatal('myexpand(): escape sequences \\0... and \\g<...>'
781+
+ ' not implemented')
782+
elif c.isdecimal():
783+
if cur_str:
784+
ops += [cur_str]
785+
cur_str = ''
786+
if first is None:
787+
first = len(ops)
788+
ops += [int(c)]
789+
else:
790+
cur_str += '\\' + c
791+
if cur_str:
792+
ops += [cur_str]
793+
794+
if first is None:
795+
# no group reference found, repl == '' was excluded above
796+
return ops[0]
797+
798+
# build replacement text with line number information
799+
t = text_from_match(m, ops[first], text)
800+
if first > 0:
801+
t = text_add_frame(ops[0], '', t)
802+
for i in range(first + 1, len(ops)):
803+
if type(ops[i]) is int:
804+
t2 = text_from_match(m, ops[i], text)
805+
t = text_combine(t, t2)
806+
else:
807+
t = text_add_frame('', ops[i], t)
808+
return t
751809

752810
def mysearch(expr, text, flags=0):
753811
if type(text) is not tuple:
@@ -1451,15 +1509,16 @@ def f(m):
14511509
s = r'\s+'
14521510
if not t:
14531511
continue
1454-
14551512
if t[0].isalpha():
14561513
t = r'\b' + t # require word boundary
14571514
if t[-1].isalpha():
14581515
t = t + r'\b'
1459-
r = s = ''
1460-
for i in range(i + 1, len(lin)):
1461-
r += s + lin[i]
1462-
s = ' '
1516+
1517+
r = ' '.join(lin[i+1:])
1518+
if re.search(r'(?<!\\)%', r):
1519+
fatal('please use escaped \\% for replacement in file "'
1520+
+ cmdline.repl + '"', r)
1521+
r = re.sub('\\\\', '\\\\\\\\', r) # \ ==> \\
14631522
text = mysub(t, r, text)
14641523

14651524

0 commit comments

Comments
 (0)