|
27 | 27 | import java.util.ArrayList; |
28 | 28 | import java.util.Arrays; |
29 | 29 | import java.util.Iterator; |
| 30 | +import java.util.regex.Matcher; |
| 31 | +import java.util.regex.Pattern; |
30 | 32 |
|
31 | 33 | public class RequestParser |
32 | 34 | { |
@@ -478,4 +480,46 @@ public static ArrayList<HttpCookie> getCookiesFromHeaders(ArrayList<String> head |
478 | 480 |
|
479 | 481 | return null; |
480 | 482 | } |
| 483 | + |
| 484 | + /** |
| 485 | + * extract the charset encoding from the HTTP response headers. |
| 486 | + * |
| 487 | + * @param contentType content-type header to be parsed |
| 488 | + * @return returns the charset encoding if we've found it, or null. |
| 489 | + */ |
| 490 | + public static String getCharsetFromHeaders(String contentType){ |
| 491 | + if (contentType != null && contentType.toLowerCase().trim().contains("charset=")){ |
| 492 | + String[] parts = contentType.toLowerCase().trim().split("="); |
| 493 | + if (parts.length > 0) |
| 494 | + return parts[1]; |
| 495 | + } |
| 496 | + |
| 497 | + return null; |
| 498 | + } |
| 499 | + |
| 500 | + /** |
| 501 | + * extract the charset encoding of a web site from the <meta> headers. |
| 502 | + * |
| 503 | + * @param body html body of the site to be parsed |
| 504 | + * @return returns the charset encoding if we've found it, or null. |
| 505 | + */ |
| 506 | + public static String getCharsetFromBody(String body) { |
| 507 | + if (body != null) { |
| 508 | + // match <body>, <body onLoad="">, etc... |
| 509 | + int headEnd = body.toLowerCase().indexOf("</head>"); |
| 510 | + |
| 511 | + // return null if there's no head tags |
| 512 | + if (headEnd == -1) |
| 513 | + return null; |
| 514 | + |
| 515 | + String body_head = body.toLowerCase().substring(0, headEnd); |
| 516 | + |
| 517 | + Pattern p = Pattern.compile("charset=([\"a-z0-9A-Z-]+)"); |
| 518 | + Matcher m = p.matcher(body_head); |
| 519 | + if (m.find()) |
| 520 | + return m.toMatchResult().group(1).replaceAll("\"", ""); |
| 521 | + } |
| 522 | + |
| 523 | + return null; |
| 524 | + } |
481 | 525 | } |
0 commit comments