Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018/*
019 * The MIT License (MIT)
020 * Copyright (c) 2014 Martin Kleppmann
021 *
022 * Permission is hereby granted, free of charge, to any person obtaining a copy
023 * of this software and associated documentation files (the "Software"), to deal
024 * in the Software without restriction, including without limitation the rights
025 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
026 * copies of the Software, and to permit persons to whom the Software is
027 * furnished to do so, subject to the following conditions:
028 *
029 * The above copyright notice and this permission notice shall be included in
030 * all copies or substantial portions of the Software.
031 *
032 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
033 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
034 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
035 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
036 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
037 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
038 * THE SOFTWARE.
039 */
040package org.apache.hadoop.hbase.test.util.warc;
041
042import java.io.ByteArrayOutputStream;
043import java.io.DataInput;
044import java.io.DataOutput;
045import java.io.IOException;
046import java.util.LinkedHashMap;
047import java.util.Map;
048import java.util.regex.Pattern;
049
/**
 * Implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing it out of
 * a {@link DataInput} stream; once parsed, the record offers no mutating methods. The file format
 * is documented in the
 * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">ISO Standard</a>.
 * In a nutshell, it's a textual format consisting of lines delimited by {@code \r\n}. Each record
 * has the following structure:
 * <ol>
 * <li>A line indicating the WARC version number, such as {@code WARC/1.0}.</li>
 * <li>Several header lines (in key-value format, similar to HTTP or email headers), giving
 * information about the record. The header is terminated by an empty line.</li>
 * <li>A body consisting of raw bytes (the number of bytes is indicated in one of the
 * headers).</li>
 * <li>A final separator of {@code \r\n\r\n} before the next record starts.</li>
 * </ol>
 * There are various different types of records, as documented on
 * {@link Header#getRecordType()}.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*");
  private static final String CRLF = "\r\n";
  private static final byte CR = 13;
  private static final byte LF = 10;
  private static final byte[] CRLF_BYTES = { CR, LF };

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. Exactly one record
   * is consumed: the version line, the headers, {@code Content-Length} bytes of body, and the
   * trailing {@code \r\n\r\n} separator.
   * @param in The input source from which one record will be read.
   * @throws IOException           if reading from {@code in} fails, including
   *                               {@link java.io.EOFException} when the input ends mid-record.
   * @throws IllegalStateException if the input is not a well-formed WARC record (bad version
   *                               line, malformed header line, missing or malformed
   *                               {@code Content-Length}, or wrong trailing separator).
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    content = new byte[header.getContentLength()];
    in.readFully(content);
    readSeparator(in);
  }

  /**
   * Reads the version line and all header lines up to and including the empty line that
   * terminates the header section.
   */
  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    // LinkedHashMap keeps the headers in the order they appeared, so that write() emits them in
    // the same order.
    LinkedHashMap<String, String> headers = new LinkedHashMap<String, String>();
    String line, fieldName = null;

    do {
      line = readLine(in);
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        // A line starting with whitespace continues the value of the preceding header field.
        headers.put(fieldName, headers.get(fieldName) + line);
      } else if (!line.isEmpty()) {
        String[] field = line.split(":", 2);
        if (field.length < 2) {
          throw new IllegalStateException("Malformed header line: " + line);
        }
        fieldName = field[0].trim();
        headers.put(fieldName, field[1].trim());
      }
    } while (!line.isEmpty());

    return new Header(headers);
  }

  /**
   * Reads bytes up to the next CR LF pair and returns them (without the terminator) decoded as
   * UTF-8. Only the exact sequence CR LF ends a line: a CR followed by any other byte is ordinary
   * data and is kept in the returned line.
   */
  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // True when the previous byte was a CR whose fate (terminator or data) is not yet known.
    boolean pendingCR = false;
    while (true) {
      byte b = in.readByte();
      if (pendingCR) {
        if (b == LF) {
          break;
        }
        // The held-back CR was not part of a line terminator, so it belongs to the line.
        out.write(CR);
      }
      pendingCR = (b == CR);
      if (!pendingCR) {
        out.write(b);
      }
    }
    return out.toString("UTF-8");
  }

  /**
   * Consumes the four-byte CR LF CR LF separator that follows the record body.
   */
  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != CR || sep[1] != LF || sep[2] != CR || sep[3] != LF) {
      throw new IllegalStateException(
        String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1],
          sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content of the body
   * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of
   * a {@code response} type header, the body consists of the full HTTP response returned by the
   * server (HTTP headers followed by the body).
   * <p>
   * The returned array is the record's internal buffer, not a copy; callers should not modify it.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not
   * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same
   * meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   * @throws IOException if writing to {@code out} fails.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    // Empty line terminating the header section.
    out.write(CRLF_BYTES);
    out.write(content);
    // Two CR LF pairs form the record separator.
    out.write(CRLF_BYTES);
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record (its header only; the body is
   * not included).
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers
   * in key-value format, where some header keys are standardised, but nonstandard ones can be
   * added. The documentation of the methods in this class is excerpted from the
   * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">WARC 1.0
   * specification</a>. Please see the specification for more detail.
   */
  public static final class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the {@code WARC-Type} header field). WARC 1.0
     * defines the following record types (for full definitions, see the
     * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">spec</a>):
     * <ul>
     * <li>{@code warcinfo}: Describes the records that follow it, up through end of file, end of
     * input, or until next {@code warcinfo} record. Typically, this appears once and at the
     * beginning of a WARC file. For a web archive, it often contains information about the web
     * crawl which generated the following records. The format of this descriptive record block may
     * vary, though the use of the {@code "application/warc-fields"} content-type is recommended.
     * (...)</li>
     * <li>{@code response}: The record should contain a complete scheme-specific response,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code response} record block should contain the full HTTP
     * response received over the network, including headers. That is, it contains the 'Response'
     * message defined by section 6 of HTTP/1.1 (RFC2616). The WARC record's Content-Type field
     * should contain the value defined by HTTP/1.1, {@code "application/http;msgtype=response"}.
     * The payload of the record is defined as its 'entity-body' (per RFC2616), with any
     * transfer-encoding removed.</li>
     * <li>{@code resource}: The record contains a resource, without full protocol response
     * information. For example: a file directly retrieved from a locally accessible repository or
     * the result of a networked retrieval where the protocol information has been discarded. For a
     * target-URI of the {@code http} or {@code https} schemes, a {@code resource} record block
     * shall contain the returned 'entity-body' (per RFC2616, with any transfer-encodings removed),
     * possibly truncated.</li>
     * <li>{@code request}: The record holds the details of a complete scheme-specific request,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code request} record block should contain the full HTTP
     * request sent over the network, including headers. That is, it contains the 'Request' message
     * defined by section 5 of HTTP/1.1 (RFC2616). The WARC record's Content-Type field should
     * contain the value defined by HTTP/1.1, {@code "application/http;msgtype=request"}. The
     * payload of a {@code request} record with a target-URI of scheme {@code http} or
     * {@code https} is defined as its 'entity-body' (per RFC2616), with any transfer-encoding
     * removed.</li>
     * <li>{@code metadata}: The record contains content created in order to further describe,
     * explain, or accompany a harvested resource, in ways not covered by other record types. A
     * {@code metadata} record will almost always refer to another record of another type, with
     * that other record holding original harvested or transformed content. The format of the
     * metadata record block may vary. The {@code "application/warc-fields"} format may be
     * used.</li>
     * <li>{@code revisit}: The record describes the revisitation of content already archived, and
     * might include only an abbreviated content body which has to be interpreted relative to a
     * previous record. Most typically, a {@code revisit} record is used instead of a
     * {@code response} or {@code resource} record to indicate that the content visited was either
     * a complete or substantial duplicate of material previously archived. A {@code revisit}
     * record shall contain a WARC-Profile field which determines the interpretation of the
     * record's fields and record block. Please see the specification for details.</li>
     * <li>{@code conversion}: The record shall contain an alternative version of another record's
     * content that was created as the result of an archival process. Typically, this is used to
     * hold content transformations that maintain viability of content after widely available
     * rendering tools for the originally stored format disappear. As needed, the original content
     * may be migrated (transformed) to a more viable format in order to keep the information
     * usable with current tools while minimizing loss of information.</li>
     * <li>{@code continuation}: Record blocks from {@code continuation} records must be appended
     * to corresponding prior record blocks (eg. from other WARC files) to create the logically
     * complete full-sized original record. That is, {@code continuation} records are used when a
     * record that would otherwise cause a WARC file size to exceed a desired limit is broken into
     * segments. A continuation record shall contain the named fields
     * {@code WARC-Segment-Origin-ID} and {@code WARC-Segment-Number}, and the last
     * {@code continuation} record of a series shall contain a {@code WARC-Segment-Total-Length}
     * field. Please see the specification for details.</li>
     * <li>Other record types may be added in future, so this list is not exclusive.</li>
     * </ul>
     * @return The record's {@code WARC-Type} header field, as a string, or null if not present.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C
     * profile of ISO8601. The timestamp shall represent the instant that data capture for record
     * creation began. Multiple records written as part of a single capture event shall use the same
     * WARC-Date, even though the times of their writing will not be exactly synchronized.
     * @return The record's {@code WARC-Date} header field, as a string, or null if not present.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its period of
     * intended use. No identifier scheme is mandated by this specification, but each record-id
     * shall be a legal URI and clearly indicate a documented and registered scheme to which it
     * conforms (e.g., via a URI scheme prefix such as {@code http:} or {@code urn:}).
     * @return The record's {@code WARC-Record-ID} header field, as a string, or null if not
     *         present.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For example, in
     * HTTP request and response records, this would be {@code application/http} as per section
     * 19.1 of RFC2616 (or {@code application/http; msgtype=request} and
     * {@code application/http; msgtype=response} respectively). In particular, the content-type is
     * <em>not</em> the value of the HTTP Content-Type header in an HTTP response, but a MIME type
     * to describe the full archived HTTP message (hence {@code application/http} if the block
     * contains request or response headers).
     * @return The record's {@code Content-Type} header field, as a string, or null if not present.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record. In the
     * context of web harvesting, this is the URI that was the target of a crawler's retrieval
     * request. For a {@code revisit} record, it is the URI that was the target of a retrieval
     * request. Indirectly, such as for a {@code metadata}, or {@code conversion} record, it is a
     * copy of the {@code WARC-Target-URI} appearing in the original record to which the newer
     * record pertains. The URI in this value shall be properly escaped according to RFC3986, and
     * written with no internal whitespace.
     * @return The record's {@code WARC-Target-URI} header field, as a string, or null if not
     *         present.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     * @return The record's {@code Content-Length} header field, parsed into a non-negative int.
     * @throws IllegalStateException if the header is missing, is not a valid int, or is negative.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      int length;
      try {
        length = Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr, e);
      }
      // A negative length cannot describe a body; reject it here rather than letting the body
      // buffer allocation fail with NegativeArraySizeException.
      if (length < 0) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
      }
      return length;
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with that field
     * name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format, encoded as UTF-8.
     * The empty line that terminates the header section is not written by this method.
     * @param out The data output to which the header should be written.
     * @throws IOException if writing to {@code out} fails.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes("UTF-8"));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed by
     * colon-delimited key-value pairs, and {@code \r\n} line endings. The version line is always
     * {@link WARCRecord#WARC_VERSION}, regardless of the version that was parsed.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION);
      buf.append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey());
        buf.append(": ");
        buf.append(field.getValue());
        buf.append(CRLF);
      }
      return buf.toString();
    }
  }

}