Skip to content

Commit 24b7e7c

Browse files
authored
Merge pull request #2226 from Doge2077/master
fix: Chinese Character Garbling in PPTX/DOCX Conversion by Adding Font Check and Installation
2 parents ff35c75 + 87440ba commit 24b7e7c

File tree

3 files changed

+120
-5
lines changed

3 files changed

+120
-5
lines changed

Diff for: docker/china/Dockerfile

+10
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,17 @@ RUN apt-get update && \
1818
wget \
1919
git \
2020
libgl1 \
21+
libreoffice \
22+
fonts-noto-cjk \
23+
fonts-wqy-zenhei \
24+
fonts-wqy-microhei \
25+
ttf-mscorefonts-installer \
26+
fontconfig \
2127
libglib2.0-0 \
28+
libxrender1 \
29+
libsm6 \
30+
libxext6 \
31+
poppler-utils \
2232
&& rm -rf /var/lib/apt/lists/*
2333

2434
# Set Python 3.10 as the default python3

Diff for: docker/global/Dockerfile

+10
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,17 @@ RUN apt-get update && \
1818
wget \
1919
git \
2020
libgl1 \
21+
libreoffice \
22+
fonts-noto-cjk \
23+
fonts-wqy-zenhei \
24+
fonts-wqy-microhei \
25+
ttf-mscorefonts-installer \
26+
fontconfig \
2127
libglib2.0-0 \
28+
libxrender1 \
29+
libsm6 \
30+
libxext6 \
31+
poppler-utils \
2232
&& rm -rf /var/lib/apt/lists/*
2333

2434
# Set Python 3.10 as the default python3

Diff for: magic_pdf/utils/office_to_pdf.py

+100-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
import subprocess
3+
import platform
34
from pathlib import Path
5+
import shutil
46

57

68
class ConvertToPdfError(Exception):
@@ -9,21 +11,114 @@ def __init__(self, msg):
911
super().__init__(self.msg)
1012

1113

14+
# Chinese font families; at least one of them must be installed.
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']

# Lower-case file-name fragments under which the families above are normally
# installed on Windows (e.g. simsun.ttc, msyh.ttc, NotoSansCJK-Regular.ttc).
# The family name itself rarely appears in the file name, so it cannot be
# used alone for detection.
_WINDOWS_FONT_FILE_HINTS = {
    'SimSun': ('simsun',),
    'Microsoft YaHei': ('msyh',),
    'Noto Sans CJK SC': ('notosanscjk',),
}

# Font container formats to consider; CJK fonts are frequently .ttc/.otf.
_FONT_FILE_SUFFIXES = ('.ttf', '.ttc', '.otf')


def _windows_font_file_names():
    """Return lower-cased font file names from the system and per-user font folders."""
    font_dirs = [Path(os.environ.get('WINDIR', 'C:/Windows')) / 'Fonts']
    local_app_data = os.environ.get('LOCALAPPDATA')
    if local_app_data:
        # Fonts installed "for the current user" live here instead of %WINDIR%\Fonts.
        font_dirs.append(Path(local_app_data) / 'Microsoft/Windows/Fonts')

    names = []
    for font_dir in font_dirs:
        if not font_dir.is_dir():
            continue
        names.extend(
            entry.name.lower()
            for entry in font_dir.iterdir()
            if entry.suffix.lower() in _FONT_FILE_SUFFIXES
        )
    return names


def _windows_has_font(family, file_names):
    """Return True if any file name in *file_names* looks like it provides *family*."""
    needles = _WINDOWS_FONT_FILE_HINTS.get(family, ()) + (family.lower(),)
    return any(needle in name for name in file_names for needle in needles)


def check_fonts_installed():
    """Check if required Chinese fonts are installed.

    On Windows the system and per-user font folders are scanned for files
    matching one of ``REQUIRED_CHS_FONTS``. On every other platform the
    output of ``fc-list :lang=zh`` is searched for the family names.

    Returns:
        True when at least one required font is found.

    Raises:
        EnvironmentError: if none of the required fonts is found
            ("Missing Chinese font..."), or if ``fc-list`` cannot be run
            ("Font detection failed...").
    """
    missing_message = (
        f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
    )

    if platform.system() == 'Windows':
        file_names = _windows_font_file_names()
        if any(_windows_has_font(font, file_names) for font in REQUIRED_CHS_FONTS):
            return True
        raise EnvironmentError(missing_message)

    # Linux/macOS: use fc-list. Only the subprocess call is guarded so that a
    # genuine "missing font" result is not re-reported as a detection failure.
    try:
        output = subprocess.check_output(
            ['fc-list', ':lang=zh'],
            encoding='utf-8',
            errors='replace',  # font paths/names are not guaranteed to be valid UTF-8
        )
    except (OSError, subprocess.SubprocessError) as e:
        raise EnvironmentError(
            f"Font detection failed. Please install 'fontconfig' and fonts: {e}"
        ) from e

    if any(font in output for font in REQUIRED_CHS_FONTS):
        return True
    raise EnvironmentError(missing_message)
43+
44+
45+
def get_soffice_command():
    """Return the path to LibreOffice's soffice executable depending on the platform.

    Lookup order:
      1. ``soffice`` (then ``libreoffice``, the launcher name some packages
         expose instead) on ``PATH``.
      2. Well-known installation locations for the current platform.

    Returns:
        str: path of the executable to invoke.

    Raises:
        ConvertToPdfError: if no LibreOffice executable can be located. The
            message contains platform-specific installation instructions.
    """
    # First check PATH; this honours whatever the user configured explicitly.
    for executable in ('soffice', 'libreoffice'):
        found = shutil.which(executable)
        if found:
            return found

    if platform.system() == 'Windows':
        relative = 'LibreOffice/program/soffice.exe'
        candidates = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / relative,
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / relative,
            Path('C:/Program Files') / relative,
            Path('C:/Program Files (x86)') / relative,
        ]
        # Installations placed directly in the root of a drive.
        candidates.extend(
            Path(f"{drive}/{relative}")
            for drive in ('C:', 'D:', 'E:', 'F:', 'G:', 'H:')
        )

        # dict.fromkeys drops duplicates (the environment variables usually
        # resolve to the hard-coded defaults) while keeping lookup order.
        for candidate in dict.fromkeys(candidates):
            if candidate.exists():
                return str(candidate)

        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )

    # Linux/macOS: try standard locations that may not be on PATH.
    standard_locations = (
        '/usr/bin/soffice',
        '/usr/local/bin/soffice',
        '/opt/libreoffice/program/soffice',
        '/Applications/LibreOffice.app/Contents/MacOS/soffice',
    )
    for location in standard_locations:
        if os.path.exists(location):
            return location

    # Raised directly (not wrapped again) so the instructions are the message.
    raise ConvertToPdfError(
        "LibreOffice not found. Please install it:\n"
        "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
        "  - CentOS/RHEL: sudo yum install libreoffice\n"
        "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
        "  - Or ensure soffice is in your PATH environment variable."
    )
98+
99+
12100
def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF.

    Runs LibreOffice in headless mode and writes the resulting PDF into
    ``output_dir`` (created if it does not exist).

    Args:
        input_path: path of the document to convert.
        output_dir: directory that receives the generated PDF.

    Raises:
        FileNotFoundError: if ``input_path`` is not an existing file.
        EnvironmentError: propagated from ``check_fonts_installed`` when no
            required Chinese font is available.
        ConvertToPdfError: if LibreOffice cannot be located or exits with a
            non-zero status.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    # Without a CJK font LibreOffice renders Chinese text as garbage, so fail
    # early instead of producing an unusable PDF.
    check_fonts_installed()

    soffice_cmd = get_soffice_command()

    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # stderr may be in the console code page rather than UTF-8 (notably on
        # Windows); never let a decode error hide the real conversion failure.
        stderr_text = process.stderr.decode(errors='replace')
        raise ConvertToPdfError(f"LibreOffice convert failed: {stderr_text}")

0 commit comments

Comments
 (0)