1
1
package us .codecraft .webmagic .selector ;
2
2
3
- import java .io .StringWriter ;
4
- import java .util .ArrayList ;
5
- import java .util .Iterator ;
6
- import java .util .List ;
7
- import java .util .Map ;
3
+ import java .util .*;
8
4
import java .util .concurrent .ConcurrentHashMap ;
9
5
10
6
import javax .xml .namespace .NamespaceContext ;
11
7
import javax .xml .parsers .ParserConfigurationException ;
12
- import javax .xml .transform .OutputKeys ;
13
- import javax .xml .transform .Transformer ;
14
- import javax .xml .transform .TransformerFactory ;
15
- import javax .xml .transform .dom .DOMSource ;
16
- import javax .xml .transform .stream .StreamResult ;
17
8
import javax .xml .xpath .XPathConstants ;
18
9
import javax .xml .xpath .XPathExpression ;
19
10
import javax .xml .xpath .XPathExpressionException ;
32
23
import net .sf .saxon .xpath .XPathEvaluator ;
33
24
import us .codecraft .webmagic .utils .BaseSelectorUtils ;
34
25
26
+ import static us .codecraft .webmagic .selector .JaxpSelectorUtils .*;
27
+
35
28
/**
36
29
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
37
30
*
38
-
31
+ * @author [email protected] , hooy <br>
39
32
* Date: 13-4-21
40
33
* Time: 上午9:39
41
34
*/
42
- public class Xpath2Selector implements Selector {
35
+ public class Xpath2Selector implements Selector , NodeSelector {
43
36
44
- private String xpathStr ;
37
+ private final String xpathStr ;
45
38
46
39
private XPathExpression xPathExpression ;
47
40
48
- private Logger logger = LoggerFactory .getLogger (getClass ());
41
+ private final Logger logger = LoggerFactory .getLogger (getClass ());
49
42
50
43
public Xpath2Selector (String xpathStr ) {
51
44
this .xpathStr = xpathStr ;
@@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) {
56
49
}
57
50
}
58
51
52
+ public static Xpath2Selector newInstance (String xpathStr ) {
53
+ return new Xpath2Selector (xpathStr );
54
+ }
55
+
59
56
enum XPath2NamespaceContext implements NamespaceContext {
60
57
61
58
INSTANCE ;
62
59
63
- private final Map <String , String > prefix2NamespaceMap = new ConcurrentHashMap <String , String >();
60
+ private final Map <String , String > prefix2NamespaceMap = new ConcurrentHashMap <>();
64
61
65
- private final Map <String , List <String >> namespace2PrefixMap = new ConcurrentHashMap <String , List < String > >();
62
+ private final Map <String , List <String >> namespace2PrefixMap = new ConcurrentHashMap <>();
66
63
67
64
private void put (String prefix , String namespaceURI ) {
68
65
prefix2NamespaceMap .put (prefix , namespaceURI );
69
- List <String > prefixes = namespace2PrefixMap .get (namespaceURI );
70
- if (prefixes == null ) {
71
- prefixes = new ArrayList <String >();
72
- namespace2PrefixMap .put (namespaceURI , prefixes );
73
- }
66
+ List <String > prefixes = namespace2PrefixMap .computeIfAbsent (namespaceURI , k -> new ArrayList <>());
74
67
prefixes .add (prefix );
75
68
}
76
69
77
- private XPath2NamespaceContext () {
70
+ XPath2NamespaceContext () {
78
71
put ("fn" , NamespaceConstant .FN );
79
72
put ("xslt" , NamespaceConstant .XSLT );
80
73
put ("xhtml" , NamespaceConstant .XHTML );
@@ -113,29 +106,18 @@ private void init() throws XPathExpressionException {
113
106
@ Override
114
107
public String select (String text ) {
115
108
try {
116
- Object result ;
117
- try {
118
- result = xPathExpression .evaluate (parse (text ), XPathConstants .NODESET );
119
- } catch (XPathExpressionException e ) {
120
- result = xPathExpression .evaluate (parse (text ), XPathConstants .STRING );
121
- }
122
- if (result instanceof NodeList ) {
123
- NodeList nodeList = (NodeList ) result ;
124
- if (nodeList .getLength () == 0 ) {
125
- return null ;
126
- }
127
- Node item = nodeList .item (0 );
128
- if (item .getNodeType () == Node .ATTRIBUTE_NODE || item .getNodeType () == Node .TEXT_NODE ) {
129
- return item .getTextContent ();
130
- } else {
131
- StreamResult xmlOutput = new StreamResult (new StringWriter ());
132
- Transformer transformer = TransformerFactory .newInstance ().newTransformer ();
133
- transformer .setOutputProperty (OutputKeys .OMIT_XML_DECLARATION , "yes" );
134
- transformer .transform (new DOMSource (item ), xmlOutput );
135
- return xmlOutput .getWriter ().toString ();
136
- }
137
- }
138
- return result .toString ();
109
+ Document doc = parse (text );
110
+ return select (doc );
111
+ } catch (Exception e ) {
112
+ logger .error ("select text error! " + xpathStr , e );
113
+ }
114
+ return null ;
115
+ }
116
+
117
+ @ Override
118
+ public String select (Node node ) {
119
+ try {
120
+ return (String ) xPathExpression .evaluate (node , XPathConstants .STRING );
139
121
} catch (Exception e ) {
140
122
logger .error ("select text error! " + xpathStr , e );
141
123
}
@@ -144,43 +126,72 @@ public String select(String text) {
144
126
145
127
@ Override
146
128
public List <String > selectList (String text ) {
147
- List <String > results = new ArrayList <String >();
148
129
try {
149
- Object result ;
150
- try {
151
- result = xPathExpression .evaluate (parse (text ), XPathConstants .NODESET );
152
- } catch (XPathExpressionException e ) {
153
- result = xPathExpression .evaluate (parse (text ), XPathConstants .STRING );
154
- }
155
- if (result instanceof NodeList ) {
156
- NodeList nodeList = (NodeList ) result ;
157
- Transformer transformer = TransformerFactory .newInstance ().newTransformer ();
158
- StreamResult xmlOutput = new StreamResult ();
159
- transformer .setOutputProperty (OutputKeys .OMIT_XML_DECLARATION , "yes" );
160
- for (int i = 0 ; i < nodeList .getLength (); i ++) {
161
- Node item = nodeList .item (i );
162
- if (item .getNodeType () == Node .ATTRIBUTE_NODE || item .getNodeType () == Node .TEXT_NODE ) {
163
- results .add (item .getTextContent ());
164
- } else {
165
- xmlOutput .setWriter (new StringWriter ());
166
- transformer .transform (new DOMSource (item ), xmlOutput );
167
- results .add (xmlOutput .getWriter ().toString ());
168
- }
169
- }
170
- } else {
171
- results .add (result .toString ());
172
- }
130
+ Document doc = parse (text );
131
+ return selectList (doc );
132
+ } catch (Exception e ) {
133
+ logger .error ("select text error! " + xpathStr , e );
134
+ }
135
+ return null ;
136
+ }
137
+
138
+ @ Override
139
+ public List <String > selectList (Node node ) {
140
+ try {
141
+ NodeList result = (NodeList ) xPathExpression .evaluate (node , XPathConstants .NODESET );
142
+ List <Node > nodes = NodeListToArrayList (result );
143
+ return nodesToStrings (nodes );
173
144
} catch (Exception e ) {
174
145
logger .error ("select text error! " + xpathStr , e );
175
146
}
176
- return results ;
147
+ return null ;
177
148
}
178
149
179
- private Document parse (String text ) throws ParserConfigurationException {
150
+ public Node selectNode (String text ) {
151
+ try {
152
+ Document doc = parse (text );
153
+ return selectNode (doc );
154
+ } catch (Exception e ) {
155
+ logger .error ("select text error! " + xpathStr , e );
156
+ }
157
+ return null ;
158
+ }
159
+
160
+ public Node selectNode (Node node ) {
161
+ try {
162
+ return (Node ) xPathExpression .evaluate (node , XPathConstants .NODE );
163
+ } catch (Exception e ) {
164
+ logger .error ("select text error! " + xpathStr , e );
165
+ }
166
+ return null ;
167
+ }
168
+
169
+ public List <Node > selectNodes (String text ) {
170
+ try {
171
+ Document doc = parse (text );
172
+ return selectNodes (doc );
173
+ } catch (Exception e ) {
174
+ logger .error ("select text error! " + xpathStr , e );
175
+ }
176
+ return null ;
177
+ }
178
+
179
+ public List <Node > selectNodes (Node node ) {
180
+ try {
181
+ NodeList result = (NodeList ) xPathExpression .evaluate (node , XPathConstants .NODESET );
182
+ return NodeListToArrayList (result );
183
+ } catch (Exception e ) {
184
+ logger .error ("select text error! " + xpathStr , e );
185
+ }
186
+ return null ;
187
+ }
188
+
189
+ protected static Document parse (String text ) throws ParserConfigurationException {
180
190
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
181
191
text = BaseSelectorUtils .preParse (text );
182
192
HtmlCleaner htmlCleaner = new HtmlCleaner ();
183
193
TagNode tagNode = htmlCleaner .clean (text );
184
194
return new DomSerializer (new CleanerProperties ()).createDOM (tagNode );
185
195
}
196
+
186
197
}
0 commit comments