Skip to content
This repository was archived by the owner on Feb 5, 2021. It is now read-only.

Commit 349dbbc

Browse files
authored
Release 1.3.3
1 parent 577e39f commit 349dbbc

File tree

1 file changed

+84
-134
lines changed

1 file changed

+84
-134
lines changed

tex2txt.py

+84 -134
Original file line number | Diff line number | Diff line change
@@ -13,52 +13,13 @@
1313
# along with this program. If not, see <https://www.gnu.org/licenses/>.
1414
#
1515

16+
#######################################################################
17+
#
1618
# Python3:
1719
# Extract raw text from LaTeX file, write result to standard output
1820
#
19-
# . output suitable for check, e.g., with LanguageTool (LT)
20-
# . we make an effort to avoid creation of additional empty lines that
21-
# break sentences for LT; this keeps number of "false" LT warnings low
22-
# . line number changes caused by this approach can be compensated by
23-
# a small filter for LT messages using the file from option --nums
24-
# . punctuation in displayed equations can be checked to a certain extent
25-
#
26-
# - argument:
27-
# name of file with input text; read standard input if missing
28-
# - option --nums file: (file name)
29-
# file for storing original line numbers;
30-
# can be used later to correct line numbers in messages
31-
# - option --repl file: (file name)
32-
# file with replacements performed at the end, namely after
33-
# changing, e.g., inline maths to text and german hyphen "= to - ;
34-
# see LAB:SPELLING below for line syntax
35-
# - option --extr ma[,mb,...]: (list of macro names)
36-
# extract only first braced argument of these macros;
37-
# useful, e.g., for check of foreign-language text and footnotes
38-
# - option --lang xy: (language de or en, default: de)
39-
# used for adaptation of equation replacements, math operator names,
40-
# proof titles, and replacement of foreign-language text;
41-
# see LAB:LANGUAGE below
42-
# - option --unkn:
43-
# print list of "undeclared" macros and environments
44-
#
45-
# Some actions:
46-
# - \begin{...} and \end{...} of environments are deleted;
47-
# tailored behaviour for some environment types listed below
48-
# - text in heading macros as \section{...} is extracted
49-
# - placeholders for \ref, \eqref, \pageref, and \cite
50-
# - "undeclared" macros are silently ignored
51-
# - inline math $...$ is replaced with text from rotating collection
52-
# in variable parms.inline_math
53-
# - equation environments are resolved in a way suitable for check of
54-
# punctuation; the argument of \text{...} is included in the output text;
55-
# \[ ... \] is same as environment equation*;
56-
# see LAB:EQUATIONS below for example and detailed description
57-
# - some treatment for \item[...] labels, see LAB:ITEMS
58-
# - rare LT warnings can be suppressed using \LTadd, \LTskip,
59-
# and \LTalter (see below) in the LaTeX text with suitable macro
60-
# definitions there, e.g. adding something for LT only:
61-
# \newcommand{\LTadd}[1]{}
21+
# Usage and main operations:
22+
# - see README
6223
#
6324
# Principle of operation:
6425
# - read complete input text into a string, then make replacements
@@ -98,6 +59,7 @@ class Aux: pass
9859
# args:
9960
# - A: mandatory {...} argument
10061
# - O: optional [...] argument
62+
# - P: mandatory [...] argument, see for instance \cite
10163
# repl:
10264
# - replacement pattern, r'\d' (d: single digit) extracts text
10365
# from position d in args (counting from 1);
@@ -162,9 +124,12 @@ class Aux: pass
162124
# BUG: quite probably, some macro is missing here ;-)
163125
#
164126
parms.system_macros = lambda: (
127+
Macro('cite', 'A', '[1]'),
128+
Macro('cite', 'PA', r'[1, \1]'),
165129
Macro('color', 'A'),
166130
Macro('colorbox', 'AA', r'\2'),
167131
Macro('documentclass', 'OA'),
132+
Macro('eqref', 'A', '(7)'),
168133
Macro('fcolorbox', 'AAA', r'\3'),
169134
Macro('footnote', 'OA', '5'),
170135
Macro('footnotemark', 'O', '5'),
@@ -175,6 +140,8 @@ class Aux: pass
175140
Macro('includegraphics', 'OA'),
176141
Macro('input', 'A'),
177142
Macro('newcommand', 'AOA'),
143+
Macro('pageref', 'A', '99'),
144+
Macro('ref', 'A', '13'),
178145
Macro('texorpdfstring', 'AA', r'\1'),
179146
Macro('textcolor', 'AA', r'\2'),
180147
Macro('usepackage', 'OA'),
@@ -207,20 +174,6 @@ class Aux: pass
207174
r'subsubsection\*?',
208175
)
209176

210-
# theorem environments from package amsthm with optional argument [...]:
211-
# display a title and text in optional argument as (...) with final dot
212-
#
213-
parms.theorem_environments = lambda: (
214-
# (environment name, text title)
215-
('Anmerkung', 'Anmerkung'),
216-
('Beispiel', 'Beispiel'),
217-
('Definition', 'Definition'),
218-
('Korollar', 'Korollar'),
219-
('Nachweis', 'Nachweis'),
220-
('Proposition', 'Proposition'),
221-
('Satz', 'Satz'),
222-
)
223-
224177
# equation environments, partly from LaTeX package amsmath;
225178
# see comments at LAB:EQUATIONS below
226179
#
@@ -257,15 +210,49 @@ class Aux: pass
257210
)
258211

259212
# at the end, we delete all unknown "standard" environment frames;
260-
# these are environments with options / arguments at \begin{...}
213+
# here are environments with options / arguments at \begin{...},
214+
# or with a replacement text for \begin{...}
261215
#
262-
# EnvBegArg(name, args)
216+
# EnvBegin(name, args='', repl='')
263217
# - args: as for Macro()
218+
# - repl: as for Macro()
219+
#
220+
parms.environment_begins = lambda: (
221+
EnvBegin('figure', 'O'),
222+
EnvBegin('minipage', 'A'),
223+
EnvBegin('tabular', 'A'),
224+
225+
# proof: try replacement with option, and only after that without
226+
EnvBegin('proof', 'P', r'\1.'),
227+
EnvBegin('proof', '', parms.proof_title + '.'),
228+
229+
# theorems: same order as for proof
230+
) + tuple(EnvBegin(env, 'P', title + r' 1.2 (\1).')
231+
for (env, title) in parms.theorem_environments()
232+
) + tuple(EnvBegin(env, '', title + ' 1.2.')
233+
for (env, title) in parms.theorem_environments()
234+
)
235+
236+
# theorem environments from package amsthm with optional argument [...]:
237+
# display a title and text in optional argument as (...) with final dot
264238
#
265-
parms.environments_with_args = lambda: (
266-
EnvBegArg('figure', 'O'),
267-
EnvBegArg('minipage', 'A'),
268-
EnvBegArg('tabular', 'A'),
239+
parms.theorem_environments = lambda: (
240+
# (environment name, text title)
241+
('Anmerkung', 'Anmerkung'),
242+
('Beispiel', 'Beispiel'),
243+
('Definition', 'Definition'),
244+
('Korollar', 'Korollar'),
245+
('Nachweis', 'Nachweis'),
246+
('Proposition', 'Proposition'),
247+
('Satz', 'Satz'),
248+
249+
('corollary', 'Corollary'),
250+
('definition', 'Definition'),
251+
('example', 'Example'),
252+
('lemma', 'Lemma'),
253+
('proposition', 'Proposition'),
254+
('remark', 'Remark'),
255+
('theorem', 'Theorem'),
269256
)
270257

271258
# a list of 2-tuples for other things to be replaced
@@ -367,8 +354,6 @@ def set_language_en():
367354
# further replacements performed below:
368355
#
369356
# - replacement of $...$ inline math
370-
# - proof environment
371-
# - macros for cross references
372357
# - handling of displayed equations
373358
# - some treatment of \item[...] labels
374359
# - environments not listed above: \begin{...} and \end{...} deleted
@@ -470,8 +455,8 @@ def EquEnv(name, args='', repl=''):
470455
return (name, args, repl)
471456
def EnvRepl(name, repl=''):
472457
return (name, repl)
473-
def EnvBegArg(name, args=''):
474-
return (name, args)
458+
def EnvBegin(name, args='', repl=''):
459+
return (name, args, repl)
475460
def re_code_args(args, who, s):
476461
# return regular expression for 'OAA' code
477462
ret = ''
@@ -480,9 +465,17 @@ def re_code_args(args, who, s):
480465
ret += sp_braced
481466
elif a == 'O':
482467
ret += r'(?:' + sp_bracketed + r')?'
468+
elif a == 'P':
469+
ret += sp_bracketed
483470
else:
484471
fatal(who + "('" + s + "',...): bad argument code '" + args + "'")
485472
return ret
473+
def check_repl_string(args, repl, who, s):
474+
for m in re.finditer(r'\\(\d)', repl):
475+
n = int(m.group(1))
476+
if n < 1 or n > len(args):
477+
fatal('invalid "\\' + m.group(1) + '" in replacement for '
478+
+ who + "('" + s + "', ...)")
486479

487480
# the expression r'\\to\b' does not work as expected on \to0
488481
# --> use r'\\to' + end_mac
@@ -496,16 +489,9 @@ def re_code_args(args, who, s):
496489
skip_space_macro = (r'(?:[ \t]*(?:\n(?=[ \t]*\S)(?![ \t]*\\begin'
497490
+ end_mac + r'))?[ \t]*)')
498491

499-
# these RE match beginning and end of arbitrary "standard" environments,
500-
# and those with arguments at \begin as declared above
492+
# these RE match beginning and end of arbitrary "standard" environments
501493
#
502-
re_begin_env = op = ''
503-
for (name, args) in parms.environments_with_args():
504-
expr = begin_lbr + name + r'\}' + re_code_args(args, 'EnvBegArg', name)
505-
re_begin_env += op + r'(?:' + expr + r')'
506-
op = r'|'
507-
re_begin_env += op + r'(?:' + begin_lbr + r'[^\\{}]+\})'
508-
re_begin_env = r'(?:' + re_begin_env + r')'
494+
re_begin_env = begin_lbr + r'[^\\{}]+\}'
509495
re_end_env = end_lbr + r'[^\\{}]+\}'
510496

511497
# UTF-8 characters;
@@ -722,15 +708,10 @@ def f(m):
722708
f
723709
)]
724710

725-
for (s, t) in parms.theorem_environments():
726-
actions += [
727-
# first try with option ...
728-
(begin_lbr + s + r'\}' + sp_bracketed, t + r' 1.2 (\1).'),
729-
# ... and then without
730-
(begin_lbr + s + r'\}', t + r' 1.2.'),
731-
# delete \end{...}
732-
(eat_eol(end_lbr + s + r'\}'), eol2space),
733-
]
711+
for (name, args, repl) in parms.environment_begins():
712+
expr = begin_lbr + name + r'\}' + re_code_args(args, 'EnvBegin', name)
713+
check_repl_string(args, repl, 'EnvBegin', name)
714+
actions += [(expr, r'\\begin{%}' + repl)]
734715

735716
# replace $...$ by text from variable parms.inline_math
736717
# BUG (with warning): fails e.g. on $x \text{ for $x>0$}$
@@ -743,27 +724,6 @@ def f(m):
743724
return parms.inline_math[0]
744725
actions += [(r'(?<!\\)\$((?:' + braced + r'|[^\\$]|\\.)*)\$', f)]
745726

746-
# proof environment with optional [...]:
747-
# extract text in [...] and append '.'
748-
#
749-
actions += [
750-
# first try version with option ...
751-
(begin_lbr + r'proof\}' + sp_bracketed, r'\1.'),
752-
# ... then without
753-
(begin_lbr + r'proof\}', parms.proof_title + '.'),
754-
(eat_eol(end_lbr + r'proof\}'), eol2space)
755-
]
756-
757-
# replace \cite, \eqref, \ref, \pageref
758-
#
759-
actions += [
760-
(r'\\cite' + sp_bracketed + sp_braced, r'[1, \1]'),
761-
(r'\\cite' + sp_braced, '[1]'),
762-
(r'\\eqref' + sp_braced, '(7)'),
763-
(r'\\ref' + sp_braced, '13'),
764-
(r'\\pageref' + sp_braced, '99')
765-
]
766-
767727
# now perform the collected replacement actions
768728
#
769729
for (expr, repl) in actions:
@@ -786,12 +746,7 @@ def f(m):
786746
expr = (r'(?:(?:' + expr + r'(?!' + skip_space + r'[[{])'
787747
+ skip_space_macro + r')|(?:'
788748
+ expr + re_code_args(args, 'Macro', name) + r'))')
789-
for m in re.finditer(r'\\(\d)', repl):
790-
# make error messages more accessible (hopefully)
791-
n = int(m.group(1))
792-
if n < 1 or n > len(args):
793-
fatal('inavlid "\\' + m.group(1) + '" in replacement for "'
794-
+ name + '"')
749+
check_repl_string(args, repl, 'Macro', name)
795750
while mysearch(expr, text):
796751
# macro might be nested
797752
text = mysub(expr, mark_deleted + repl, text)
@@ -804,28 +759,24 @@ def f(m):
804759
##################################################################
805760

806761
# example:
807-
808-
"""
809-
Thus,
810-
%
811-
\begin{align}
812-
\mu &= f(x) \quad\text{for all } \mu\in\Omega, \notag \\
813-
x &= \begin{cases}
814-
0 & \text{ for} \ y>0 \\
815-
1 & \text{ in case} y\le 0.
816-
\end{cases} \label{lab}
817-
\end{align}
818-
"""
819-
762+
#
763+
# Thus,
764+
# %
765+
# \begin{align}
766+
# \mu &= f(x) \quad\text{for all } \mu\in\Omega, \notag \\
767+
# x &= \begin{cases}
768+
# 0 & \text{ for} \ y>0 \\
769+
# 1 & \text{ in case} y\le 0.
770+
# \end{cases} \label{lab}
771+
# \end{align}
772+
#
820773
# becomes with parms.change_repl_after_punct == True
821774
# and --lang en:
822-
823-
"""
824-
Thus,
825-
U equal V for all W,
826-
X equal Y for Z
827-
Z in caseU.
828-
"""
775+
#
776+
# Thus,
777+
# U equal V for all W,
778+
# X equal Y for Z
779+
# Z in caseU.
829780

830781
# 1. split equation environment into 'lines' delimited by \\ alias \newline
831782
# 2. split each 'line' into 'sections' delimited by &
@@ -1027,13 +978,12 @@ def f(m):
1027978
if m not in macsknown:
1028979
print('\\' + m)
1029980
envs = []
1030-
envsknown = ('%',) + tuple(e[0] for e in parms.environments_with_args())
1031981
for m in re.finditer(begin_lbr + r'([^\\{}]+)\}', text_get_txt(text)):
1032982
if m.group(1) not in envs:
1033983
envs += [m.group(1)]
1034984
envs.sort()
1035985
for e in envs:
1036-
if e not in envsknown:
986+
if e != '%':
1037987
print(r'\begin{' + e + '}')
1038988
exit()
1039989

0 commit comments

Comments
 (0)