Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018/*
019 * The MIT License (MIT)
020 * Copyright (c) 2014 Martin Kleppmann
021 *
022 * Permission is hereby granted, free of charge, to any person obtaining a copy
023 * of this software and associated documentation files (the "Software"), to deal
024 * in the Software without restriction, including without limitation the rights
025 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
026 * copies of the Software, and to permit persons to whom the Software is
027 * furnished to do so, subject to the following conditions:
028 *
029 * The above copyright notice and this permission notice shall be included in
030 * all copies or substantial portions of the Software.
031 *
032 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
033 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
034 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
035 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
036 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
037 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
038 * THE SOFTWARE.
039 */
040package org.apache.hadoop.hbase.test.util.warc;
041
042import java.io.ByteArrayOutputStream;
043import java.io.DataInput;
044import java.io.DataOutput;
045import java.io.IOException;
046import java.util.LinkedHashMap;
047import java.util.Map;
048import java.util.regex.Pattern;
049
/**
 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing
 * it out of a {@link DataInput} stream.
 * <p>
 * The file format is documented in the
 * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">ISO Standard</a>. In
 * a nutshell, it's a textual format consisting of lines delimited by {@code \r\n}. Each record has
 * the following structure:
 * <ol>
 * <li>A line indicating the WARC version number, such as {@code WARC/1.0}.</li>
 * <li>Several header lines (in key-value format, similar to HTTP or email headers), giving
 * information about the record. The header is terminated by an empty line.</li>
 * <li>A body consisting of raw bytes (the number of bytes is indicated in one of the headers).</li>
 * <li>A final separator of {@code \r\n\r\n} before the next record starts.</li>
 * </ol>
 * There are various different types of records, as documented on {@link Header#getRecordType()}.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*");
  private static final String CRLF = "\r\n";
  private static final byte CR = 13;
  private static final byte LF = 10;
  private static final byte[] CRLF_BYTES = { CR, LF };

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. Exactly one record
   * (version line, headers, body and trailing separator) is consumed from the stream.
   * @param in The input source from which one record will be read.
   * @throws IOException           if reading from {@code in} fails, including reaching the end of
   *                               the input in the middle of a record.
   * @throws IllegalStateException if the data read is not a well-formed WARC record (bad version
   *                               line, malformed header line, missing or invalid
   *                               {@code Content-Length}, or wrong trailing separator).
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    // getContentLength() rejects negative values, so the allocation below cannot fail with a
    // NegativeArraySizeException.
    content = new byte[header.getContentLength()];
    in.readFully(content);
    readSeparator(in);
  }

  /**
   * Reads the version line and all header lines, up to and including the empty line that
   * terminates the header section.
   */
  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    // LinkedHashMap keeps the fields in the order in which they appeared in the input, which is
    // also the order in which Header#toString() emits them again.
    Map<String, String> headers = new LinkedHashMap<>();
    String fieldName = null;

    for (String line = readLine(in); !line.isEmpty(); line = readLine(in)) {
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        // A line starting with a space or tab continues the value of the preceding field. The
        // line is appended as-is, including its leading whitespace.
        headers.put(fieldName, headers.get(fieldName) + line);
        continue;
      }
      String[] field = line.split(":", 2);
      if (field.length < 2) {
        throw new IllegalStateException("Malformed header line: " + line);
      }
      fieldName = field[0].trim();
      headers.put(fieldName, field[1].trim());
    }

    return new Header(headers);
  }

  /**
   * Reads bytes up to the next CR LF pair and returns them, without the terminator, decoded as
   * UTF-8. A CR that is not immediately followed by LF is ordinary line content and is kept.
   */
  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    boolean pendingCR = false;
    while (true) {
      byte b = in.readByte();
      if (pendingCR) {
        if (b == LF) {
          break;
        }
        // The CR we were holding back did not start a line terminator, so it is data.
        out.write(CR);
        pendingCR = false;
      }
      if (b == CR) {
        // Hold the CR back until the next byte tells us whether it starts a CR LF pair.
        pendingCR = true;
      } else {
        out.write(b);
      }
    }
    return out.toString("UTF-8");
  }

  /**
   * Consumes the four-byte CR LF CR LF separator that follows the record body.
   */
  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != CR || sep[1] != LF || sep[2] != CR || sep[3] != LF) {
      throw new IllegalStateException(
        String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1],
          sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content of the body
   * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of
   * a {@code response} type header, the body consists of the full HTTP response returned by the
   * server (HTTP headers followed by the body).
   * <p>
   * The returned array is the record's internal buffer, not a copy, so callers should treat it as
   * read-only.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not
   * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same
   * meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   * @throws IOException if writing to {@code out} fails.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    // Empty line terminating the header section.
    out.write(CRLF_BYTES);
    out.write(content);
    // Record separator: CR LF CR LF.
    out.write(CRLF_BYTES);
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record.
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers
   * in key-value format, where some header keys are standardised, but nonstandard ones can be
   * added.
   * <p>
   * The documentation of the methods in this class is excerpted from the
   * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">WARC 1.0
   * specification</a>. Please see the specification for more detail.
   */
  public static final class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the {@code WARC-Type} header field). WARC 1.0
     * defines the following record types (for full definitions, see the
     * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">spec</a>):
     * <ul>
     * <li>{@code warcinfo}: Describes the records that follow it, up through end of file, end of
     * input, or until next {@code warcinfo} record. Typically, this appears once and at the
     * beginning of a WARC file. For a web archive, it often contains information about the web
     * crawl which generated the following records.
     * <p>
     * The format of this descriptive record block may vary, though the use of the
     * {@code "application/warc-fields"} content-type is recommended. (...)</li>
     * <li>{@code response}: The record should contain a complete scheme-specific response,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code response} record block should contain the full HTTP
     * response received over the network, including headers. That is, it contains the 'Response'
     * message defined by section 6 of HTTP/1.1 (RFC2616).
     * <p>
     * The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     * {@code "application/http;msgtype=response"}. The payload of the record is defined as its
     * 'entity-body' (per RFC2616), with any transfer-encoding removed.</li>
     * <li>{@code resource}: The record contains a resource, without full protocol response
     * information. For example: a file directly retrieved from a locally accessible repository or
     * the result of a networked retrieval where the protocol information has been discarded. For a
     * target-URI of the {@code http} or {@code https} schemes, a {@code resource} record block
     * shall contain the returned 'entity-body' (per RFC2616, with any transfer-encodings removed),
     * possibly truncated.</li>
     * <li>{@code request}: The record holds the details of a complete scheme-specific request,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code request} record block should contain the full HTTP
     * request sent over the network, including headers. That is, it contains the 'Request' message
     * defined by section 5 of HTTP/1.1 (RFC2616).
     * <p>
     * The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     * {@code "application/http;msgtype=request"}. The payload of a {@code request} record with a
     * target-URI of scheme {@code http} or {@code https} is defined as its 'entity-body' (per
     * RFC2616), with any transfer-encoding removed.</li>
     * <li>{@code metadata}: The record contains content created in order to further describe,
     * explain, or accompany a harvested resource, in ways not covered by other record types. A
     * {@code metadata} record will almost always refer to another record of another type, with
     * that other record holding original harvested or transformed content.
     * <p>
     * The format of the metadata record block may vary. The {@code "application/warc-fields"}
     * format may be used.</li>
     * <li>{@code revisit}: The record describes the revisitation of content already archived, and
     * might include only an abbreviated content body which has to be interpreted relative to a
     * previous record. Most typically, a {@code revisit} record is used instead of a
     * {@code response} or {@code resource} record to indicate that the content visited was either
     * a complete or substantial duplicate of material previously archived.
     * <p>
     * A {@code revisit} record shall contain a WARC-Profile field which determines the
     * interpretation of the record's fields and record block. Please see the specification for
     * details.</li>
     * <li>{@code conversion}: The record shall contain an alternative version of another record's
     * content that was created as the result of an archival process. Typically, this is used to
     * hold content transformations that maintain viability of content after widely available
     * rendering tools for the originally stored format disappear. As needed, the original content
     * may be migrated (transformed) to a more viable format in order to keep the information
     * usable with current tools while minimizing loss of information.</li>
     * <li>{@code continuation}: Record blocks from {@code continuation} records must be appended
     * to corresponding prior record blocks (eg. from other WARC files) to create the logically
     * complete full-sized original record. That is, {@code continuation} records are used when a
     * record that would otherwise cause a WARC file size to exceed a desired limit is broken into
     * segments. A continuation record shall contain the named fields
     * {@code WARC-Segment-Origin-ID} and {@code WARC-Segment-Number}, and the last
     * {@code continuation} record of a series shall contain a {@code WARC-Segment-Total-Length}
     * field. Please see the specification for details.</li>
     * <li>Other record types may be added in future, so this list is not exclusive.</li>
     * </ul>
     * @return The record's {@code WARC-Type} header field, as a string, or null if not present.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C
     * profile of ISO8601. The timestamp shall represent the instant that data capture for record
     * creation began. Multiple records written as part of a single capture event shall use the same
     * WARC-Date, even though the times of their writing will not be exactly synchronized.
     * @return The record's {@code WARC-Date} header field, as a string, or null if not present.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its period of
     * intended use. No identifier scheme is mandated by this specification, but each record-id
     * shall be a legal URI and clearly indicate a documented and registered scheme to which it
     * conforms (e.g., via a URI scheme prefix such as {@code http:} or {@code urn:}).
     * @return The record's {@code WARC-Record-ID} header field, as a string, or null if not
     *         present.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For example, in
     * HTTP request and response records, this would be {@code application/http} as per section
     * 19.1 of RFC2616 (or {@code application/http; msgtype=request} and
     * {@code application/http; msgtype=response} respectively).
     * <p>
     * In particular, the content-type is *not* the value of the HTTP Content-Type header in an HTTP
     * response, but a MIME type to describe the full archived HTTP message (hence
     * {@code application/http} if the block contains request or response headers).
     * @return The record's {@code Content-Type} header field, as a string, or null if not present.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record. In the
     * context of web harvesting, this is the URI that was the target of a crawler's retrieval
     * request. For a {@code revisit} record, it is the URI that was the target of a retrieval
     * request. Indirectly, such as for a {@code metadata}, or {@code conversion} record, it is a
     * copy of the {@code WARC-Target-URI} appearing in the original record to which the newer
     * record pertains. The URI in this value shall be properly escaped according to RFC3986, and
     * written with no internal whitespace.
     * @return The record's {@code WARC-Target-URI} header field, as a string, or null if not
     *         present.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     * @return The record's {@code Content-Length} header field, parsed into a non-negative int.
     * @throws IllegalStateException if the header is missing, is not a valid int, or is negative.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      int length;
      try {
        length = Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr, e);
      }
      if (length < 0) {
        // A negative length can never describe a body; report it like any other malformed value
        // instead of letting the caller fail later when sizing the content buffer.
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
      }
      return length;
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with that field
     * name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. The empty line that
     * terminates the header section is not written by this method.
     * @param out The data output to which the header should be written.
     * @throws IOException if writing to {@code out} fails.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes("UTF-8"));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed by
     * colon-delimited key-value pairs, and {@code \r\n} line endings.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION);
      buf.append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey());
        buf.append(": ");
        buf.append(field.getValue());
        buf.append(CRLF);
      }
      return buf.toString();
    }
  }

}