Skip to content

Commit 7179311

Browse files
authored
向 webmagic-saxon 组件提供若干新 API,更优雅更灵活更强大 (#1108)
* Feature: * webmagic-saxon 组件新增若干新 API; * Update: 更优雅的写代码。 * Update: JaxpSelectorUtils 工具类增加 final 关键字。
1 parent f47038d commit 7179311

File tree

4 files changed

+216
-89
lines changed

4 files changed

+216
-89
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package us.codecraft.webmagic.selector;
2+
3+
import org.w3c.dom.Node;
4+
import org.w3c.dom.NodeList;
5+
6+
import javax.xml.transform.OutputKeys;
7+
import javax.xml.transform.Transformer;
8+
import javax.xml.transform.TransformerException;
9+
import javax.xml.transform.TransformerFactory;
10+
import javax.xml.transform.dom.DOMSource;
11+
import javax.xml.transform.stream.StreamResult;
12+
import java.io.StringWriter;
13+
import java.util.ArrayList;
14+
import java.util.Collections;
15+
import java.util.List;
16+
17+
/**
18+
* @author hooy
19+
*/
20+
public final class JaxpSelectorUtils {
21+
22+
private JaxpSelectorUtils() {
23+
throw new RuntimeException("The util class cannot be instanced");
24+
}
25+
26+
public static List<Node> NodeListToArrayList(NodeList nodes) {
27+
List<Node> list = new ArrayList<>(nodes.getLength());
28+
for (int i = 0; i < nodes.getLength(); i++) {
29+
list.add(nodes.item(i));
30+
}
31+
return list;
32+
}
33+
34+
public static String nodeToString(Node node) throws TransformerException {
35+
List<Node> before = Collections.singletonList(node);
36+
List<String> after = nodesToStrings(before);
37+
if (after.size() > 0) {
38+
return after.get(0);
39+
} else {
40+
return null;
41+
}
42+
}
43+
44+
public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
45+
List<String> results = new ArrayList<>(nodes.size());
46+
Transformer transformer = TransformerFactory.newInstance().newTransformer();
47+
StreamResult xmlOutput = new StreamResult();
48+
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
49+
for (Node node : nodes) {
50+
if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) {
51+
results.add(node.getTextContent());
52+
} else {
53+
xmlOutput.setWriter(new StringWriter());
54+
transformer.transform(new DOMSource(node), xmlOutput);
55+
results.add(xmlOutput.getWriter().toString());
56+
}
57+
}
58+
return results;
59+
}
60+
61+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package us.codecraft.webmagic.selector;
2+
3+
import org.w3c.dom.Node;
4+
5+
import java.util.List;
6+
7+
/**
 * Contract for selectors (extractors) that operate on an already parsed DOM
 * {@link Node} and report their matches as text.<br>
 *
 * @author hooy <br>
 * @since 0.8.0
 */
public interface NodeSelector {

    /**
     * Extracts one textual result from the given node.<br>
     * When several results exist, only the first one is returned.
     *
     * @param node the node to run the selection against
     * @return the selected text
     */
    String select(Node node);

    /**
     * Extracts every textual result from the given node.<br>
     *
     * @param node the node to run the selection against
     * @return all selected texts
     */
    List<String> selectList(Node node);

}
Lines changed: 83 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,10 @@
11
package us.codecraft.webmagic.selector;
22

3-
import java.io.StringWriter;
4-
import java.util.ArrayList;
5-
import java.util.Iterator;
6-
import java.util.List;
7-
import java.util.Map;
3+
import java.util.*;
84
import java.util.concurrent.ConcurrentHashMap;
95

106
import javax.xml.namespace.NamespaceContext;
117
import javax.xml.parsers.ParserConfigurationException;
12-
import javax.xml.transform.OutputKeys;
13-
import javax.xml.transform.Transformer;
14-
import javax.xml.transform.TransformerFactory;
15-
import javax.xml.transform.dom.DOMSource;
16-
import javax.xml.transform.stream.StreamResult;
178
import javax.xml.xpath.XPathConstants;
189
import javax.xml.xpath.XPathExpression;
1910
import javax.xml.xpath.XPathExpressionException;
@@ -32,20 +23,22 @@
3223
import net.sf.saxon.xpath.XPathEvaluator;
3324
import us.codecraft.webmagic.utils.BaseSelectorUtils;
3425

26+
import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
27+
3528
/**
3629
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
3730
*
38-
* @author [email protected] <br>
31+
* @author [email protected], hooy <br>
3932
* Date: 13-4-21
4033
* Time: 上午9:39
4134
*/
42-
public class Xpath2Selector implements Selector {
35+
public class Xpath2Selector implements Selector, NodeSelector {
4336

44-
private String xpathStr;
37+
private final String xpathStr;
4538

4639
private XPathExpression xPathExpression;
4740

48-
private Logger logger = LoggerFactory.getLogger(getClass());
41+
private final Logger logger = LoggerFactory.getLogger(getClass());
4942

5043
public Xpath2Selector(String xpathStr) {
5144
this.xpathStr = xpathStr;
@@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) {
5649
}
5750
}
5851

52+
public static Xpath2Selector newInstance(String xpathStr) {
53+
return new Xpath2Selector(xpathStr);
54+
}
55+
5956
enum XPath2NamespaceContext implements NamespaceContext {
6057

6158
INSTANCE;
6259

63-
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
60+
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
6461

65-
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
62+
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
6663

6764
private void put(String prefix, String namespaceURI) {
6865
prefix2NamespaceMap.put(prefix, namespaceURI);
69-
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
70-
if (prefixes == null) {
71-
prefixes = new ArrayList<String>();
72-
namespace2PrefixMap.put(namespaceURI, prefixes);
73-
}
66+
List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
7467
prefixes.add(prefix);
7568
}
7669

77-
private XPath2NamespaceContext() {
70+
XPath2NamespaceContext() {
7871
put("fn", NamespaceConstant.FN);
7972
put("xslt", NamespaceConstant.XSLT);
8073
put("xhtml", NamespaceConstant.XHTML);
@@ -113,29 +106,18 @@ private void init() throws XPathExpressionException {
113106
@Override
114107
public String select(String text) {
115108
try {
116-
Object result;
117-
try {
118-
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
119-
} catch (XPathExpressionException e) {
120-
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
121-
}
122-
if (result instanceof NodeList) {
123-
NodeList nodeList = (NodeList) result;
124-
if (nodeList.getLength() == 0) {
125-
return null;
126-
}
127-
Node item = nodeList.item(0);
128-
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
129-
return item.getTextContent();
130-
} else {
131-
StreamResult xmlOutput = new StreamResult(new StringWriter());
132-
Transformer transformer = TransformerFactory.newInstance().newTransformer();
133-
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
134-
transformer.transform(new DOMSource(item), xmlOutput);
135-
return xmlOutput.getWriter().toString();
136-
}
137-
}
138-
return result.toString();
109+
Document doc = parse(text);
110+
return select(doc);
111+
} catch (Exception e) {
112+
logger.error("select text error! " + xpathStr, e);
113+
}
114+
return null;
115+
}
116+
117+
@Override
118+
public String select(Node node) {
119+
try {
120+
return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
139121
} catch (Exception e) {
140122
logger.error("select text error! " + xpathStr, e);
141123
}
@@ -144,43 +126,72 @@ public String select(String text) {
144126

145127
@Override
146128
public List<String> selectList(String text) {
147-
List<String> results = new ArrayList<String>();
148129
try {
149-
Object result;
150-
try {
151-
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
152-
} catch (XPathExpressionException e) {
153-
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
154-
}
155-
if (result instanceof NodeList) {
156-
NodeList nodeList = (NodeList) result;
157-
Transformer transformer = TransformerFactory.newInstance().newTransformer();
158-
StreamResult xmlOutput = new StreamResult();
159-
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
160-
for (int i = 0; i < nodeList.getLength(); i++) {
161-
Node item = nodeList.item(i);
162-
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
163-
results.add(item.getTextContent());
164-
} else {
165-
xmlOutput.setWriter(new StringWriter());
166-
transformer.transform(new DOMSource(item), xmlOutput);
167-
results.add(xmlOutput.getWriter().toString());
168-
}
169-
}
170-
} else {
171-
results.add(result.toString());
172-
}
130+
Document doc = parse(text);
131+
return selectList(doc);
132+
} catch (Exception e) {
133+
logger.error("select text error! " + xpathStr, e);
134+
}
135+
return null;
136+
}
137+
138+
@Override
139+
public List<String> selectList(Node node) {
140+
try {
141+
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
142+
List<Node> nodes = NodeListToArrayList(result);
143+
return nodesToStrings(nodes);
173144
} catch (Exception e) {
174145
logger.error("select text error! " + xpathStr, e);
175146
}
176-
return results;
147+
return null;
177148
}
178149

179-
private Document parse(String text) throws ParserConfigurationException {
150+
public Node selectNode(String text) {
151+
try {
152+
Document doc = parse(text);
153+
return selectNode(doc);
154+
} catch (Exception e) {
155+
logger.error("select text error! " + xpathStr, e);
156+
}
157+
return null;
158+
}
159+
160+
public Node selectNode(Node node) {
161+
try {
162+
return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
163+
} catch (Exception e) {
164+
logger.error("select text error! " + xpathStr, e);
165+
}
166+
return null;
167+
}
168+
169+
public List<Node> selectNodes(String text) {
170+
try {
171+
Document doc = parse(text);
172+
return selectNodes(doc);
173+
} catch (Exception e) {
174+
logger.error("select text error! " + xpathStr, e);
175+
}
176+
return null;
177+
}
178+
179+
public List<Node> selectNodes(Node node) {
180+
try {
181+
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
182+
return NodeListToArrayList(result);
183+
} catch (Exception e) {
184+
logger.error("select text error! " + xpathStr, e);
185+
}
186+
return null;
187+
}
188+
189+
protected static Document parse(String text) throws ParserConfigurationException {
180190
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
181191
text = BaseSelectorUtils.preParse(text);
182192
HtmlCleaner htmlCleaner = new HtmlCleaner();
183193
TagNode tagNode = htmlCleaner.clean(text);
184194
return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
185195
}
196+
186197
}

webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 40 additions & 17 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)