001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.test.util.warc; 019 020import java.io.ByteArrayOutputStream; 021import java.io.DataInput; 022import java.io.DataOutput; 023import java.io.IOException; 024import java.util.LinkedHashMap; 025import java.util.Map; 026import java.util.regex.Pattern; 027 028/** 029 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing 030 * it out of a {@link DataInput} stream. The file format is documented in the [ISO 031 * Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf). In a nutshell, it's 032 * a textual format consisting of lines delimited by `\r\n`. Each record has the following 033 * structure: 1. A line indicating the WARC version number, such as `WARC/1.0`. 2. Several header 034 * lines (in key-value format, similar to HTTP or email headers), giving information about the 035 * record. The header is terminated by an empty line. 3. A body consisting of raw bytes (the number 036 * of bytes is indicated in one of the headers). 4. A final separator of `\r\n\r\n` before the next 037 * record starts. There are various different types of records, as documented on 038 * {@link Header#getRecordType()}. 039 */ 040public class WARCRecord { 041 042 public static final String WARC_VERSION = "WARC/1.0"; 043 private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+"); 044 private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*"); 045 private static final String CRLF = "\r\n"; 046 private static final byte[] CRLF_BYTES = { 13, 10 }; 047 048 private final Header header; 049 private final byte[] content; 050 051 /** 052 * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. 053 * @param in The input source from which one record will be read. 054 */ 055 public WARCRecord(DataInput in) throws IOException { 056 header = readHeader(in); 057 content = new byte[header.getContentLength()]; 058 in.readFully(content); 059 readSeparator(in); 060 } 061 062 private static Header readHeader(DataInput in) throws IOException { 063 String versionLine = readLine(in); 064 if (!VERSION_PATTERN.matcher(versionLine).matches()) { 065 throw new IllegalStateException("Expected WARC version, but got: " + versionLine); 066 } 067 068 LinkedHashMap<String, String> headers = new LinkedHashMap<String, String>(); 069 String line, fieldName = null; 070 071 do { 072 line = readLine(in); 073 if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) { 074 headers.put(fieldName, headers.get(fieldName) + line); 075 } else if (!line.isEmpty()) { 076 String[] field = line.split(":", 2); 077 if (field.length < 2) { 078 throw new IllegalStateException("Malformed header line: " + line); 079 } 080 fieldName = field[0].trim(); 081 headers.put(fieldName, field[1].trim()); 082 } 083 } while (!line.isEmpty()); 084 085 return new Header(headers); 086 } 087 088 private static String readLine(DataInput in) throws IOException { 089 ByteArrayOutputStream out = new ByteArrayOutputStream(); 090 boolean seenCR = false, seenCRLF = false; 091 while (!seenCRLF) { 092 byte b = in.readByte(); 093 if (!seenCR && b == 13) { 094 seenCR = true; 095 } else if (seenCR && b == 10) { 096 seenCRLF = true; 097 } else { 098 seenCR = false; 099 out.write(b); 100 } 101 } 102 return out.toString("UTF-8"); 103 } 104 105 private static void readSeparator(DataInput in) throws IOException { 106 byte[] sep = new byte[4]; 107 in.readFully(sep); 108 if (sep[0] != 13 || sep[1] != 10 || sep[2] != 13 || sep[3] != 10) { 109 throw new IllegalStateException( 110 String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1], 111 sep[2], sep[3])); 112 } 113 } 114 115 /** 116 * Returns the parsed header structure of the WARC record. 117 */ 118 public Header getHeader() { 119 return header; 120 } 121 122 /** 123 * Returns the body of the record, as an unparsed raw array of bytes. The content of the body 124 * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of 125 * a `response` type header, the body consists of the full HTTP response returned by the server 126 * (HTTP headers followed by the body). 127 */ 128 public byte[] getContent() { 129 return content; 130 } 131 132 /** 133 * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not 134 * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same 135 * meaning and should not lose any information. 136 * @param out The output stream to which this record should be appended. 137 */ 138 public void write(DataOutput out) throws IOException { 139 header.write(out); 140 out.write(CRLF_BYTES); 141 out.write(content); 142 out.write(CRLF_BYTES); 143 out.write(CRLF_BYTES); 144 } 145 146 /** 147 * Returns a human-readable string representation of the record. 148 */ 149 @Override 150 public String toString() { 151 return header.toString(); 152 } 153 154 /** 155 * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers 156 * in key-value format, where some header keys are standardised, but nonstandard ones can be 157 * added. The documentation of the methods in this class is excerpted from the [WARC 1.0 158 * specification](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf). Please see 159 * the specification for more detail. 160 */ 161 public final static class Header { 162 private final Map<String, String> fields; 163 164 private Header(Map<String, String> fields) { 165 this.fields = fields; 166 } 167 168 /** 169 * Returns the type of WARC record (the value of the `WARC-Type` header field). WARC 1.0 defines 170 * the following record types: (for full definitions, see the 171 * [spec](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf)) * `warcinfo`: 172 * Describes the records that follow it, up through end of file, end of input, or until next 173 * `warcinfo` record. Typically, this appears once and at the beginning of a WARC file. For a 174 * web archive, it often contains information about the web crawl which generated the following 175 * records. The format of this descriptive record block may vary, though the use of the 176 * `"application/warc-fields"` content-type is recommended. (...) * `response`: The record 177 * should contain a complete scheme-specific response, including network protocol information 178 * where possible. For a target-URI of the `http` or `https` schemes, a `response` record block 179 * should contain the full HTTP response received over the network, including headers. That is, 180 * it contains the 'Response' message defined by section 6 of HTTP/1.1 (RFC2616). The WARC 181 * record's Content-Type field should contain the value defined by HTTP/1.1, 182 * `"application/http;msgtype=response"`. The payload of the record is defined as its 183 * 'entity-body' (per RFC2616), with any transfer-encoding removed. * `resource`: The record 184 * contains a resource, without full protocol response information. For example: a file directly 185 * retrieved from a locally accessible repository or the result of a networked retrieval where 186 * the protocol information has been discarded. For a target-URI of the `http` or `https` 187 * schemes, a `resource` record block shall contain the returned 'entity-body' (per RFC2616, 188 * with any transfer-encodings removed), possibly truncated. * `request`: The record holds the 189 * details of a complete scheme-specific request, including network protocol information where 190 * possible. For a target-URI of the `http` or `https` schemes, a `request` record block should 191 * contain the full HTTP request sent over the network, including headers. That is, it contains 192 * the 'Request' message defined by section 5 of HTTP/1.1 (RFC2616). The WARC record's 193 * Content-Type field should contain the value defined by HTTP/1.1, 194 * `"application/http;msgtype=request"`. The payload of a `request` record with a target-URI of 195 * scheme `http` or `https` is defined as its 'entity-body' (per RFC2616), with any 196 * transfer-encoding removed. * `metadata`: The record contains content created in order to 197 * further describe, explain, or accompany a harvested resource, in ways not covered by other 198 * record types. A `metadata` record will almost always refer to another record of another type, 199 * with that other record holding original harvested or transformed content. The format of the 200 * metadata record block may vary. The `"application/warc-fields"` format may be used. * 201 * `revisit`: The record describes the revisitation of content already archived, and might 202 * include only an abbreviated content body which has to be interpreted relative to a previous 203 * record. Most typically, a `revisit` record is used instead of a `response` or `resource` 204 * record to indicate that the content visited was either a complete or substantial duplicate of 205 * material previously archived. A `revisit` record shall contain a WARC-Profile field which 206 * determines the interpretation of the record's fields and record block. Please see the 207 * specification for details. * `conversion`: The record shall contain an alternative version of 208 * another record's content that was created as the result of an archival process. Typically, 209 * this is used to hold content transformations that maintain viability of content after widely 210 * available rendering tools for the originally stored format disappear. As needed, the original 211 * content may be migrated (transformed) to a more viable format in order to keep the 212 * information usable with current tools while minimizing loss of information. * `continuation`: 213 * Record blocks from `continuation` records must be appended to corresponding prior record 214 * blocks (eg. from other WARC files) to create the logically complete full-sized original 215 * record. That is, `continuation` records are used when a record that would otherwise cause a 216 * WARC file size to exceed a desired limit is broken into segments. A continuation record shall 217 * contain the named fields `WARC-Segment-Origin-ID` and `WARC-Segment-Number`, and the last 218 * `continuation` record of a series shall contain a `WARC-Segment-Total-Length` field. Please 219 * see the specification for details. * Other record types may be added in future, so this list 220 * is not exclusive. 221 * @return The record's `WARC-Type` header field, as a string. 222 */ 223 public String getRecordType() { 224 return fields.get("WARC-Type"); 225 } 226 227 /** 228 * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C 229 * profile of ISO8601. The timestamp shall represent the instant that data capture for record 230 * creation began. Multiple records written as part of a single capture event shall use the same 231 * WARC-Date, even though the times of their writing will not be exactly synchronized. 232 * @return The record's `WARC-Date` header field, as a string. 233 */ 234 public String getDateString() { 235 return fields.get("WARC-Date"); 236 } 237 238 /** 239 * An identifier assigned to the current record that is globally unique for its period of 240 * intended use. No identifier scheme is mandated by this specification, but each record-id 241 * shall be a legal URI and clearly indicate a documented and registered scheme to which it 242 * conforms (e.g., via a URI scheme prefix such as `http:` or `urn:`). 243 * @return The record's `WARC-Record-ID` header field, as a string. 244 */ 245 public String getRecordID() { 246 return fields.get("WARC-Record-ID"); 247 } 248 249 /** 250 * The MIME type (RFC2045) of the information contained in the record's block. For example, in 251 * HTTP request and response records, this would be `application/http` as per section 19.1 of 252 * RFC2616 (or `application/http; msgtype=request` and `application/http; msgtype=response` 253 * respectively). In particular, the content-type is *not* the value of the HTTP Content-Type 254 * header in an HTTP response, but a MIME type to describe the full archived HTTP message (hence 255 * `application/http` if the block contains request or response headers). 256 * @return The record's `Content-Type` header field, as a string. 257 */ 258 public String getContentType() { 259 return fields.get("Content-Type"); 260 } 261 262 /** 263 * The original URI whose capture gave rise to the information content in this record. In the 264 * context of web harvesting, this is the URI that was the target of a crawler's retrieval 265 * request. For a `revisit` record, it is the URI that was the target of a retrieval request. 266 * Indirectly, such as for a `metadata`, or `conversion` record, it is a copy of the 267 * `WARC-Target-URI` appearing in the original record to which the newer record pertains. The 268 * URI in this value shall be properly escaped according to RFC3986, and written with no 269 * internal whitespace. 270 * @return The record's `WARC-Target-URI` header field, as a string. 271 */ 272 public String getTargetURI() { 273 return fields.get("WARC-Target-URI"); 274 } 275 276 /** 277 * The number of bytes in the body of the record, similar to RFC2616. 278 * @return The record's `Content-Length` header field, parsed into an int. 279 */ 280 public int getContentLength() { 281 String lengthStr = fields.get("Content-Length"); 282 if (lengthStr == null) { 283 throw new IllegalStateException("Missing Content-Length header"); 284 } 285 try { 286 return Integer.parseInt(lengthStr); 287 } catch (NumberFormatException e) { 288 throw new IllegalStateException("Malformed Content-Length header: " + lengthStr); 289 } 290 } 291 292 /** 293 * Returns the value of a selected header field, or null if there is no header with that field 294 * name. 295 * @param field The name of the header to return (case-sensitive). 296 * @return The value associated with that field name, or null if not present. 297 */ 298 public String getField(String field) { 299 return fields.get(field); 300 } 301 302 /** 303 * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. 304 * @param out The data output to which the header should be written. 305 */ 306 public void write(DataOutput out) throws IOException { 307 out.write(toString().getBytes("UTF-8")); 308 } 309 310 /** 311 * Formats this header in WARC/1.0 format, consisting of a version line followed by 312 * colon-delimited key-value pairs, and `\r\n` line endings. 313 */ 314 @Override 315 public String toString() { 316 StringBuilder buf = new StringBuilder(); 317 buf.append(WARC_VERSION); 318 buf.append(CRLF); 319 for (Map.Entry<String, String> field : fields.entrySet()) { 320 buf.append(field.getKey()); 321 buf.append(": "); 322 buf.append(field.getValue()); 323 buf.append(CRLF); 324 } 325 return buf.toString(); 326 } 327 } 328 329}