|
11 | 11 | import edu.stanford.nlp.ling.tokensregex.SequenceMatchResult;
|
12 | 12 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
|
13 | 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
|
| 14 | +import edu.stanford.nlp.scenegraph.RuleBasedParser; |
| 15 | +import edu.stanford.nlp.scenegraph.SceneGraph; |
14 | 16 | import edu.stanford.nlp.semgraph.SemanticGraph;
|
15 | 17 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
|
16 | 18 | import edu.stanford.nlp.semgraph.semgrex.ProcessSemgrexRequest;
|
@@ -109,6 +111,8 @@ public class StanfordCoreNLPServer implements Runnable {
|
109 | 111 | */
|
110 | 112 | private SoftReference<Pair<String, StanfordCoreNLP>> lastPipeline = new SoftReference<>(null);
|
111 | 113 |
|
| 114 | + private RuleBasedParser sceneParser = null; |
| 115 | + |
112 | 116 | /**
|
113 | 117 | * An executor to time out CoreNLP execution with.
|
114 | 118 | */
|
@@ -295,35 +299,18 @@ private static Map<String, String> getURLParams(URI uri) throws UnsupportedEncod
|
295 | 299 | * @throws ClassNotFoundException Thrown if we cannot load the serializer.
|
296 | 300 | */
|
297 | 301 | private Annotation getDocument(Properties props, HttpExchange httpExchange) throws IOException, ClassNotFoundException {
|
298 |
| - String inputFormat = props.getProperty("inputFormat", "text"); |
| 302 | + final String inputFormat = props.getProperty("inputFormat", "text"); |
299 | 303 | String date = props.getProperty("date");
|
300 | 304 | switch (inputFormat) {
|
301 | 305 | case "text":
|
302 |
| - // The default encoding by the HTTP standard is ISO-8859-1, but most |
303 |
| - // real users of CoreNLP would likely assume UTF-8 by default. |
304 |
| - String defaultEncoding = this.strict ? "ISO-8859-1" : "UTF-8"; |
305 |
| - // Get the encoding |
306 | 306 | Headers headers = httpExchange.getRequestHeaders();
|
307 |
| - String encoding; |
308 | 307 | // the original default behavior of the server was to
|
309 | 308 | // unescape, so let's assume by default that the input text is
|
310 | 309 | // escaped. if the Content-type is set to text we will know
|
311 | 310 | // we shouldn't unescape after all
|
312 |
| - String contentType = URL_ENCODED; |
313 |
| - if (headers.containsKey("Content-type")) { |
314 |
| - contentType = headers.getFirst("Content-type").split(";")[0].trim(); |
315 |
| - String[] charsetPair = Arrays.stream(headers.getFirst("Content-type").split(";")) |
316 |
| - .map(x -> x.split("=")) |
317 |
| - .filter(x -> x.length > 0 && "charset".equals(x[0])) |
318 |
| - .findFirst().orElse(new String[]{"charset", defaultEncoding}); |
319 |
| - if (charsetPair.length == 2) { |
320 |
| - encoding = charsetPair[1]; |
321 |
| - } else { |
322 |
| - encoding = defaultEncoding; |
323 |
| - } |
324 |
| - } else { |
325 |
| - encoding = defaultEncoding; |
326 |
| - } |
| 311 | + final String contentType = getContentType(headers); |
| 312 | + // Get the encoding |
| 313 | + final String encoding = getEncoding(headers); |
327 | 314 |
|
328 | 315 | String text = IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding));
|
329 | 316 | if (contentType.equals(URL_ENCODED)) {
|
@@ -352,6 +339,71 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
|
352 | 339 | }
|
353 | 340 | }
|
354 | 341 |
|
| 342 | + private String getContentType(Headers headers) { |
| 343 | + String contentType = URL_ENCODED; |
| 344 | + if (headers.containsKey("Content-type")) { |
| 345 | + contentType = headers.getFirst("Content-type").split(";")[0].trim(); |
| 346 | + } |
| 347 | + return contentType; |
| 348 | + } |
| 349 | + |
| 350 | + private String getEncoding(Headers headers) { |
| 351 | + // The default encoding by the HTTP standard is ISO-8859-1, but most |
| 352 | + // real users of CoreNLP would likely assume UTF-8 by default. |
| 353 | + String defaultEncoding = this.strict ? "ISO-8859-1" : "UTF-8"; |
| 354 | + if (headers.containsKey("Content-type")) { |
| 355 | + String[] charsetPair = Arrays.stream(headers.getFirst("Content-type").split(";")) |
| 356 | + .map(x -> x.split("=")) |
| 357 | + .filter(x -> x.length > 0 && "charset".equals(x[0])) |
| 358 | + .findFirst().orElse(new String[]{"charset", defaultEncoding}); |
| 359 | + if (charsetPair.length == 2) { |
| 360 | + return charsetPair[1]; |
| 361 | + } else { |
| 362 | + return defaultEncoding; |
| 363 | + } |
| 364 | + } else { |
| 365 | + return defaultEncoding; |
| 366 | + } |
| 367 | + } |
| 368 | + |
| 369 | + /** |
| 370 | + * Get a SceneGraph request from the query, either from a query parameter (q) |
| 371 | + * or from the body of the request |
| 372 | + * <br> |
| 373 | + * TODO: don't actually know if the scenegraph parser is threadsafe. |
| 374 | + * |
| 375 | + * @return query |
| 376 | + */ |
| 377 | + private String getSceneGraphRequest(Properties props, HttpExchange httpExchange) throws IOException, ClassNotFoundException { |
| 378 | + final String inputFormat = props.getProperty("inputFormat", "text"); |
| 379 | + if (!inputFormat.equals("text")) { |
| 380 | + throw new IOException("Unhandled input format for scenegraph: " + inputFormat); |
| 381 | + } |
| 382 | + String query = props.getProperty("q", null); |
| 383 | + if (query != null) { |
| 384 | + return query; |
| 385 | + } |
| 386 | + |
| 387 | + Headers headers = httpExchange.getRequestHeaders(); |
| 388 | + // the original default behavior of the server was to |
| 389 | + // unescape, so let's assume by default that the input text is |
| 390 | + // escaped. if the Content-type is set to text we will know |
| 391 | + // we shouldn't unescape after all |
| 392 | + final String contentType = getContentType(headers); |
| 393 | + // Get the encoding |
| 394 | + final String encoding = getEncoding(headers); |
| 395 | + |
| 396 | + String text = IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding)); |
| 397 | + if (contentType.equals(URL_ENCODED)) { |
| 398 | + try { |
| 399 | + text = URLDecoder.decode(text, encoding); |
| 400 | + } catch (IllegalArgumentException e) { |
| 401 | + // ignore decoding errors so that libraries which don't specify a content type might not fail |
| 402 | + } |
| 403 | + } |
| 404 | + |
| 405 | + return text; |
| 406 | + } |
355 | 407 |
|
356 | 408 | /**
|
357 | 409 | * Create (or retrieve) a StanfordCoreNLP object corresponding to these properties.
|
@@ -394,6 +446,29 @@ private StanfordCoreNLP mkStanfordCoreNLP(Properties props) {
|
394 | 446 | return impl;
|
395 | 447 | }
|
396 | 448 |
|
| 449 | + /** |
| 450 | + * This server has at most one SceneGraph parser, and it is not created at startup time |
| 451 | + * as most applications will not use it. |
| 452 | + * <br> |
| 453 | + * This function call creates it in a synchronized manner, so at most one is ever created. |
| 454 | + * <br> |
| 455 | + * @return RuleBasedParser |
| 456 | + */ |
| 457 | + private RuleBasedParser mkSceneGraphParser() { |
| 458 | + if (sceneParser != null) { |
| 459 | + return sceneParser; |
| 460 | + } |
| 461 | + synchronized (this) { |
| 462 | + // in case it got created in another thread |
| 463 | + if (sceneParser != null) { |
| 464 | + return sceneParser; |
| 465 | + } |
| 466 | + RuleBasedParser parser = new RuleBasedParser(); |
| 467 | + sceneParser = parser; |
| 468 | + return parser; |
| 469 | + } |
| 470 | + } |
| 471 | + |
397 | 472 | /**
|
398 | 473 | * Parse the parameters of a connection into a CoreNLP properties file that can be passed into
|
399 | 474 | * {@link StanfordCoreNLP}, and used in the I/O stages.
|
@@ -1404,6 +1479,115 @@ public void handle(HttpExchange httpExchange) throws IOException {
|
1404 | 1479 | }
|
1405 | 1480 | }
|
1406 | 1481 |
|
| 1482 | + /** |
| 1483 | + * A handler for executing scenegraph on text |
| 1484 | + */ |
| 1485 | + protected class SceneGraphHandler implements HttpHandler { |
| 1486 | + |
| 1487 | + /** |
| 1488 | + * An authenticator to determine if we can perform this API request. |
| 1489 | + */ |
| 1490 | + private final Predicate<Properties> authenticator; |
| 1491 | + |
| 1492 | + /** |
| 1493 | + * Create a new SceneGraphHandler. |
| 1494 | + * <br> |
| 1495 | + * It's not clear what a callback would do with this, since there's no Annotation at the end of a SceneGraph call, so we just skip it |
| 1496 | + * @param callback The callback to call when annotation has finished. |
| 1497 | + */ |
| 1498 | + public SceneGraphHandler(Predicate<Properties> authenticator) { |
| 1499 | + this.authenticator = authenticator; |
| 1500 | + } |
| 1501 | + |
| 1502 | + @Override |
| 1503 | + public void handle(HttpExchange httpExchange) throws IOException { |
| 1504 | + if (onBlockList(httpExchange)) { |
| 1505 | + respondUnauthorized(httpExchange); |
| 1506 | + return; |
| 1507 | + } |
| 1508 | + setHttpExchangeResponseHeaders(httpExchange); |
| 1509 | + |
| 1510 | + Properties props = getProperties(httpExchange); |
| 1511 | + |
| 1512 | + if (authenticator != null && ! authenticator.test(props)) { |
| 1513 | + respondUnauthorized(httpExchange); |
| 1514 | + return; |
| 1515 | + } |
| 1516 | + Map<String, String> params = getURLParams(httpExchange.getRequestURI()); |
| 1517 | + |
| 1518 | + Future<Pair<String, SceneGraph>> response = corenlpExecutor.submit(() -> { |
| 1519 | + try { |
| 1520 | + // Get the document |
| 1521 | + String request = getSceneGraphRequest(props, httpExchange); |
| 1522 | + if (request == null || request.equals("")) { |
| 1523 | + respondBadInput("Blank input in scenegraph", httpExchange); |
| 1524 | + return Pair.makePair("", null); |
| 1525 | + } |
| 1526 | + RuleBasedParser parser = mkSceneGraphParser(); |
| 1527 | + |
| 1528 | + SceneGraph graph = parser.parse(request); |
| 1529 | + if (graph == null) { |
| 1530 | + respondError("Something weird happened and the text could not be parsed!", httpExchange); |
| 1531 | + } |
| 1532 | + return Pair.makePair(request, graph); |
| 1533 | + } catch (RuntimeException e) { |
| 1534 | + warn(e); |
| 1535 | + try { |
| 1536 | + respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange); |
| 1537 | + } catch (IOException ignored) { |
| 1538 | + } |
| 1539 | + } |
| 1540 | + return Pair.makePair("", null); |
| 1541 | + }); |
| 1542 | + |
| 1543 | + // Send response |
| 1544 | + try { |
| 1545 | + int timeout = getTimeout(props, httpExchange); |
| 1546 | + if (sceneParser == null) { |
| 1547 | + timeout = timeout + 60000; // add 60 seconds for loading a pipeline if needed |
| 1548 | + } |
| 1549 | + Pair<String, SceneGraph> pair = response.get(timeout, TimeUnit.MILLISECONDS); |
| 1550 | + SceneGraph graph = pair.second; |
| 1551 | + if (graph == null) { |
| 1552 | + // already responded with an error |
| 1553 | + return; |
| 1554 | + } |
| 1555 | + |
| 1556 | + final StanfordCoreNLP.OutputFormat of; |
| 1557 | + try { |
| 1558 | + of = StanfordCoreNLP.OutputFormat.valueOf(props.getProperty("outputFormat", "json").toUpperCase(Locale.ROOT)); |
| 1559 | + } catch (RuntimeException e) { |
| 1560 | + String badFormat = props.getProperty("outputFormat"); |
| 1561 | + log("Received bad output format in scenegraph '" + badFormat + "'"); |
| 1562 | + respondBadInput("Interface scenegraph does not handle output format '" + badFormat + "'", httpExchange); |
| 1563 | + return; |
| 1564 | + } |
| 1565 | + |
| 1566 | + final String result; |
| 1567 | + switch(of) { |
| 1568 | + case JSON: |
| 1569 | + int id = PropertiesUtils.getInt(props, "id", -1); |
| 1570 | + String url = props.getProperty("url", ""); |
| 1571 | + String phrase = pair.first; |
| 1572 | + result = graph.toJSON(id, url, phrase); |
| 1573 | + break; |
| 1574 | + case TEXT: |
| 1575 | + result = graph.toReadableString(); |
| 1576 | + break; |
| 1577 | + default: |
| 1578 | + log("Received unhanded output format in scenegraph '" + of + "'"); |
| 1579 | + respondBadInput("Interface scenegraph does not handle output format " + of, httpExchange); |
| 1580 | + return; |
| 1581 | + } |
| 1582 | + |
| 1583 | + byte[] content = result.getBytes(); |
| 1584 | + sendAndGetResponse(httpExchange, content); |
| 1585 | + } catch (InterruptedException | ExecutionException | TimeoutException e) { |
| 1586 | + respondError("Timeout when executing scenegraph query", httpExchange); |
| 1587 | + } |
| 1588 | + } |
| 1589 | + } |
| 1590 | + |
1407 | 1591 | private static void sendAndGetResponse(HttpExchange httpExchange, byte[] response) throws IOException {
|
1408 | 1592 | if (response.length > 0) {
|
1409 | 1593 | httpExchange.getResponseHeaders().add("Content-type", "application/json");
|
@@ -1547,6 +1731,7 @@ public void run(Optional<Pair<String,String>> basicAuth,
|
1547 | 1731 | withAuth(server.createContext(uriContext+"/tokensregex", new TokensRegexHandler(authenticator, callback)), basicAuth);
|
1548 | 1732 | withAuth(server.createContext(uriContext+"/semgrex", new SemgrexHandler(authenticator, callback)), basicAuth);
|
1549 | 1733 | withAuth(server.createContext(uriContext+"/tregex", new TregexHandler(authenticator, callback)), basicAuth);
|
| 1734 | + withAuth(server.createContext(uriContext+"/scenegraph", new SceneGraphHandler(authenticator)), basicAuth); |
1550 | 1735 | withAuth(server.createContext(uriContext+"/corenlp-brat.js", new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.js", "application/javascript")), basicAuth);
|
1551 | 1736 | withAuth(server.createContext(uriContext+"/corenlp-brat.cs", new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.css", "text/css")), basicAuth);
|
1552 | 1737 | withAuth(server.createContext(uriContext+"/corenlp-parseviewer.js", new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-parseviewer.js", "application/javascript")), basicAuth);
|
|
0 commit comments