/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * The MIT License (MIT)
 * Copyright (c) 2014 Martin Kleppmann
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.apache.hadoop.hbase.test.util.warc;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing
 * it out of a {@link DataInput} stream. The file format is documented in the [ISO
 * Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf). In a nutshell, it's
 * a textual format consisting of lines delimited by `\r\n`. Each record has the following
 * structure: 1. A line indicating the WARC version number, such as `WARC/1.0`. 2. Several header
 * lines (in key-value format, similar to HTTP or email headers), giving information about the
 * record. The header is terminated by an empty line. 3. A body consisting of raw bytes (the number
 * of bytes is indicated in one of the headers). 4. A final separator of `\r\n\r\n` before the next
 * record starts. There are various different types of records, as documented on
 * {@link Header#getRecordType()}.
/**
 * One record of a WARC (Web ARChive) file: a parsed {@link Header} followed by the raw body
 * bytes. A record is obtained by parsing it from a {@link DataInput}; the fields are never
 * reassigned afterwards. Note that {@link #getContent()} hands out the internal byte array
 * without copying, so callers must treat the returned array as read-only.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*");
  private static final String CRLF = "\r\n";
  private static final byte[] CRLF_BYTES = { 13, 10 };
  private static final byte CR = 13;
  private static final byte LF = 10;
  // Fully qualified so that the file's import list does not need to change.
  private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. Exactly one record
   * (header, body and the trailing CR LF CR LF separator) is consumed from the stream.
   * @param in The input source from which one record will be read.
   * @throws IOException           if reading from {@code in} fails, including
   *                               {@link java.io.EOFException} when the record is truncated.
   * @throws IllegalStateException if the data read is not a well-formed WARC record.
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    content = new byte[header.getContentLength()];
    in.readFully(content);
    readSeparator(in);
  }

  /**
   * Reads the version line and all header lines up to and including the empty line that
   * terminates the header section.
   */
  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    // Insertion order is kept so that a record is written back with its headers in parse order.
    Map<String, String> headers = new LinkedHashMap<>();
    String fieldName = null;

    for (String line = readLine(in); !line.isEmpty(); line = readLine(in)) {
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        // Folded header: a line starting with whitespace extends the previous field's value.
        headers.put(fieldName, headers.get(fieldName) + line);
      } else {
        String[] field = line.split(":", 2);
        if (field.length < 2) {
          throw new IllegalStateException("Malformed header line: " + line);
        }
        fieldName = field[0].trim();
        headers.put(fieldName, field[1].trim());
      }
    }

    return new Header(headers);
  }

  /**
   * Reads bytes up to the next CR LF pair and returns them, without the terminator, decoded as
   * UTF-8. A CR that is not immediately followed by LF is ordinary line data and is kept in the
   * result; in particular {@code CR CR LF} yields a line ending in a single CR.
   */
  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    boolean pendingCR = false;
    while (true) {
      byte b = in.readByte();
      if (pendingCR) {
        if (b == LF) {
          break;
        }
        // The previous CR did not start a line terminator, so it belongs to the line itself.
        out.write(CR);
        pendingCR = false;
      }
      if (b == CR) {
        // Hold the CR back until the next byte tells us whether it starts a terminator.
        pendingCR = true;
      } else {
        out.write(b);
      }
    }
    return new String(out.toByteArray(), UTF_8);
  }

  /**
   * Consumes the four-byte CR LF CR LF separator that follows the record body.
   * @throws IllegalStateException if the next four bytes are anything else.
   */
  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != 13 || sep[1] != 10 || sep[2] != 13 || sep[3] != 10) {
      throw new IllegalStateException(
        String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1],
          sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content of the body
   * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of
   * a `response` type header, the body consists of the full HTTP response returned by the server
   * (HTTP headers followed by the body).
   * <p>
   * The returned array is the record's internal buffer, not a copy; modifying it changes what
   * {@link #write(DataOutput)} emits.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not
   * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same
   * meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   * @throws IOException if writing to {@code out} fails.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    // Empty line terminating the header section.
    out.write(CRLF_BYTES);
    out.write(content);
    // Record separator: CR LF CR LF.
    out.write(CRLF_BYTES);
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record.
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers
   * in key-value format, where some header keys are standardised, but nonstandard ones can be
   * added. The documentation of the methods in this class is excerpted from the [WARC 1.0
   * specification](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf). Please see
   * the specification for more detail.
   * <p>
   * Field names are case-sensitive and each name maps to one value; if a name occurs more than
   * once in the parsed input, the last occurrence wins.
   */
  public final static class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the `WARC-Type` header field). WARC 1.0 defines
     * the following record types: (for full definitions, see the
     * [spec](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf))
     * <ul>
     * <li>`warcinfo`: Describes the records that follow it, up through end of file, end of input,
     * or until next `warcinfo` record. Typically, this appears once and at the beginning of a WARC
     * file. For a web archive, it often contains information about the web crawl which generated
     * the following records. The format of this descriptive record block may vary, though the use
     * of the `"application/warc-fields"` content-type is recommended. (...)</li>
     * <li>`response`: The record should contain a complete scheme-specific response, including
     * network protocol information where possible. For a target-URI of the `http` or `https`
     * schemes, a `response` record block should contain the full HTTP response received over the
     * network, including headers. That is, it contains the 'Response' message defined by section 6
     * of HTTP/1.1 (RFC2616). The WARC record's Content-Type field should contain the value defined
     * by HTTP/1.1, `"application/http;msgtype=response"`. The payload of the record is defined as
     * its 'entity-body' (per RFC2616), with any transfer-encoding removed.</li>
     * <li>`resource`: The record contains a resource, without full protocol response information.
     * For example: a file directly retrieved from a locally accessible repository or the result of
     * a networked retrieval where the protocol information has been discarded. For a target-URI of
     * the `http` or `https` schemes, a `resource` record block shall contain the returned
     * 'entity-body' (per RFC2616, with any transfer-encodings removed), possibly truncated.</li>
     * <li>`request`: The record holds the details of a complete scheme-specific request, including
     * network protocol information where possible. For a target-URI of the `http` or `https`
     * schemes, a `request` record block should contain the full HTTP request sent over the
     * network, including headers. That is, it contains the 'Request' message defined by section 5
     * of HTTP/1.1 (RFC2616). The WARC record's Content-Type field should contain the value defined
     * by HTTP/1.1, `"application/http;msgtype=request"`. The payload of a `request` record with a
     * target-URI of scheme `http` or `https` is defined as its 'entity-body' (per RFC2616), with
     * any transfer-encoding removed.</li>
     * <li>`metadata`: The record contains content created in order to further describe, explain,
     * or accompany a harvested resource, in ways not covered by other record types. A `metadata`
     * record will almost always refer to another record of another type, with that other record
     * holding original harvested or transformed content. The format of the metadata record block
     * may vary. The `"application/warc-fields"` format may be used.</li>
     * <li>`revisit`: The record describes the revisitation of content already archived, and might
     * include only an abbreviated content body which has to be interpreted relative to a previous
     * record. Most typically, a `revisit` record is used instead of a `response` or `resource`
     * record to indicate that the content visited was either a complete or substantial duplicate
     * of material previously archived. A `revisit` record shall contain a WARC-Profile field which
     * determines the interpretation of the record's fields and record block. Please see the
     * specification for details.</li>
     * <li>`conversion`: The record shall contain an alternative version of another record's
     * content that was created as the result of an archival process. Typically, this is used to
     * hold content transformations that maintain viability of content after widely available
     * rendering tools for the originally stored format disappear. As needed, the original content
     * may be migrated (transformed) to a more viable format in order to keep the information
     * usable with current tools while minimizing loss of information.</li>
     * <li>`continuation`: Record blocks from `continuation` records must be appended to
     * corresponding prior record blocks (eg. from other WARC files) to create the logically
     * complete full-sized original record. That is, `continuation` records are used when a record
     * that would otherwise cause a WARC file size to exceed a desired limit is broken into
     * segments. A continuation record shall contain the named fields `WARC-Segment-Origin-ID` and
     * `WARC-Segment-Number`, and the last `continuation` record of a series shall contain a
     * `WARC-Segment-Total-Length` field. Please see the specification for details.</li>
     * <li>Other record types may be added in future, so this list is not exclusive.</li>
     * </ul>
     * @return The record's `WARC-Type` header field, as a string.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C
     * profile of ISO8601. The timestamp shall represent the instant that data capture for record
     * creation began. Multiple records written as part of a single capture event shall use the same
     * WARC-Date, even though the times of their writing will not be exactly synchronized.
     * @return The record's `WARC-Date` header field, as a string.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its period of
     * intended use. No identifier scheme is mandated by this specification, but each record-id
     * shall be a legal URI and clearly indicate a documented and registered scheme to which it
     * conforms (e.g., via a URI scheme prefix such as `http:` or `urn:`).
     * @return The record's `WARC-Record-ID` header field, as a string.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For example, in
     * HTTP request and response records, this would be `application/http` as per section 19.1 of
     * RFC2616 (or `application/http; msgtype=request` and `application/http; msgtype=response`
     * respectively). In particular, the content-type is *not* the value of the HTTP Content-Type
     * header in an HTTP response, but a MIME type to describe the full archived HTTP message (hence
     * `application/http` if the block contains request or response headers).
     * @return The record's `Content-Type` header field, as a string.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record. In the
     * context of web harvesting, this is the URI that was the target of a crawler's retrieval
     * request. For a `revisit` record, it is the URI that was the target of a retrieval request.
     * Indirectly, such as for a `metadata`, or `conversion` record, it is a copy of the
     * `WARC-Target-URI` appearing in the original record to which the newer record pertains. The
     * URI in this value shall be properly escaped according to RFC3986, and written with no
     * internal whitespace.
     * @return The record's `WARC-Target-URI` header field, as a string.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     * @return The record's `Content-Length` header field, parsed into a non-negative int.
     * @throws IllegalStateException if the field is missing, is not a valid int, or is negative.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      int length;
      try {
        length = Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr, e);
      }
      if (length < 0) {
        // A negative length would otherwise surface later as a NegativeArraySizeException.
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
      }
      return length;
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with that field
     * name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. The empty line that
     * terminates the header section is not written here.
     * @param out The data output to which the header should be written.
     * @throws IOException if writing to {@code out} fails.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes(UTF_8));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed by
     * colon-delimited key-value pairs, and `\r\n` line endings.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION);
      buf.append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey());
        buf.append(": ");
        buf.append(field.getValue());
        buf.append(CRLF);
      }
      return buf.toString();
    }
  }

}