001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.test.util.warc;
019
020import java.io.ByteArrayOutputStream;
021import java.io.DataInput;
022import java.io.DataOutput;
023import java.io.IOException;
024import java.util.LinkedHashMap;
025import java.util.Map;
026import java.util.regex.Pattern;
027
/**
 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing
 * it out of a {@link DataInput} stream.
 * <p>
 * The file format is documented in the
 * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">ISO Standard</a>. In
 * a nutshell, it's a textual format consisting of lines delimited by {@code \r\n}. Each record has
 * the following structure:
 * <ol>
 * <li>A line indicating the WARC version number, such as {@code WARC/1.0}.</li>
 * <li>Several header lines (in key-value format, similar to HTTP or email headers), giving
 * information about the record. The header is terminated by an empty line.</li>
 * <li>A body consisting of raw bytes (the number of bytes is indicated in one of the
 * headers).</li>
 * <li>A final separator of {@code \r\n\r\n} before the next record starts.</li>
 * </ol>
 * There are various different types of records, as documented on {@link Header#getRecordType()}.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  // DOTALL so that a folded line which happens to contain a stray CR or LF byte is still
  // recognised as a continuation instead of being parsed as a new header field.
  private static final Pattern CONTINUATION_PATTERN =
    Pattern.compile("^[\\t ]+.*", Pattern.DOTALL);
  private static final String CRLF = "\r\n";
  private static final byte CR = 13;
  private static final byte LF = 10;
  private static final byte[] CRLF_BYTES = { CR, LF };

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. Exactly one record
   * is consumed: the version line, the headers, the body, and the trailing separator.
   * @param in The input source from which one record will be read.
   * @throws IOException           if reading from {@code in} fails, including reaching the end of
   *                               the input before the record is complete.
   * @throws IllegalStateException if the version line, a header line, the {@code Content-Length}
   *                               header, or the final separator is malformed.
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    content = new byte[header.getContentLength()];
    in.readFully(content);
    readSeparator(in);
  }

  /**
   * Reads the version line and all header lines up to and including the empty line which
   * terminates the header.
   */
  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    // LinkedHashMap keeps the headers in the order in which they appeared in the input.
    Map<String, String> headers = new LinkedHashMap<>();
    String fieldName = null;

    for (String line = readLine(in); !line.isEmpty(); line = readLine(in)) {
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        // A line starting with whitespace continues the value of the preceding field.
        headers.put(fieldName, headers.get(fieldName) + line);
      } else {
        String[] field = line.split(":", 2);
        if (field.length < 2) {
          throw new IllegalStateException("Malformed header line: " + line);
        }
        fieldName = field[0].trim();
        headers.put(fieldName, field[1].trim());
      }
    }

    return new Header(headers);
  }

  /**
   * Reads bytes up to the next CR LF pair and returns them, without the terminator, decoded as
   * UTF-8. A CR which is not immediately followed by LF is part of the line and is preserved.
   */
  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    boolean pendingCR = false;
    while (true) {
      byte b = in.readByte();
      if (pendingCR) {
        if (b == LF) {
          break;
        }
        // The held-back CR was not the start of a line terminator, so it is line content.
        out.write(CR);
      }
      // Hold back a CR until the next byte shows whether it starts a CR LF terminator.
      pendingCR = b == CR;
      if (!pendingCR) {
        out.write(b);
      }
    }
    return out.toString("UTF-8");
  }

  /**
   * Consumes the four-byte CR LF CR LF separator which follows the body of every record.
   */
  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != CR || sep[1] != LF || sep[2] != CR || sep[3] != LF) {
      throw new IllegalStateException(
        String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1],
          sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content of the body
   * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of
   * a {@code response} type header, the body consists of the full HTTP response returned by the
   * server (HTTP headers followed by the body).
   * <p>
   * The returned array is the record's own buffer, not a copy, so callers should not modify it.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not
   * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same
   * meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   * @throws IOException if writing to {@code out} fails.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    // Empty line terminating the header.
    out.write(CRLF_BYTES);
    out.write(content);
    // Record separator: CR LF CR LF.
    out.write(CRLF_BYTES);
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record. Only the header is included;
   * the body is omitted.
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers
   * in key-value format, where some header keys are standardised, but nonstandard ones can be
   * added. The documentation of the methods in this class is excerpted from the
   * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">WARC 1.0
   * specification</a>. Please see the specification for more detail.
   */
  public static final class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the {@code WARC-Type} header field). WARC 1.0
     * defines the following record types (for full definitions, see the
     * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">spec</a>):
     * <ul>
     * <li>{@code warcinfo}: Describes the records that follow it, up through end of file, end of
     * input, or until next {@code warcinfo} record. Typically, this appears once and at the
     * beginning of a WARC file. For a web archive, it often contains information about the web
     * crawl which generated the following records. The format of this descriptive record block may
     * vary, though the use of the {@code "application/warc-fields"} content-type is recommended.
     * (...)</li>
     * <li>{@code response}: The record should contain a complete scheme-specific response,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code response} record block should contain the full HTTP
     * response received over the network, including headers. That is, it contains the 'Response'
     * message defined by section 6 of HTTP/1.1 (RFC2616). The WARC record's Content-Type field
     * should contain the value defined by HTTP/1.1, {@code "application/http;msgtype=response"}.
     * The payload of the record is defined as its 'entity-body' (per RFC2616), with any
     * transfer-encoding removed.</li>
     * <li>{@code resource}: The record contains a resource, without full protocol response
     * information. For example: a file directly retrieved from a locally accessible repository or
     * the result of a networked retrieval where the protocol information has been discarded. For a
     * target-URI of the {@code http} or {@code https} schemes, a {@code resource} record block
     * shall contain the returned 'entity-body' (per RFC2616, with any transfer-encodings removed),
     * possibly truncated.</li>
     * <li>{@code request}: The record holds the details of a complete scheme-specific request,
     * including network protocol information where possible. For a target-URI of the {@code http}
     * or {@code https} schemes, a {@code request} record block should contain the full HTTP
     * request sent over the network, including headers. That is, it contains the 'Request' message
     * defined by section 5 of HTTP/1.1 (RFC2616). The WARC record's Content-Type field should
     * contain the value defined by HTTP/1.1, {@code "application/http;msgtype=request"}. The
     * payload of a {@code request} record with a target-URI of scheme {@code http} or
     * {@code https} is defined as its 'entity-body' (per RFC2616), with any transfer-encoding
     * removed.</li>
     * <li>{@code metadata}: The record contains content created in order to further describe,
     * explain, or accompany a harvested resource, in ways not covered by other record types. A
     * {@code metadata} record will almost always refer to another record of another type, with
     * that other record holding original harvested or transformed content. The format of the
     * metadata record block may vary. The {@code "application/warc-fields"} format may be
     * used.</li>
     * <li>{@code revisit}: The record describes the revisitation of content already archived, and
     * might include only an abbreviated content body which has to be interpreted relative to a
     * previous record. Most typically, a {@code revisit} record is used instead of a
     * {@code response} or {@code resource} record to indicate that the content visited was either
     * a complete or substantial duplicate of material previously archived. A {@code revisit}
     * record shall contain a WARC-Profile field which determines the interpretation of the
     * record's fields and record block. Please see the specification for details.</li>
     * <li>{@code conversion}: The record shall contain an alternative version of another record's
     * content that was created as the result of an archival process. Typically, this is used to
     * hold content transformations that maintain viability of content after widely available
     * rendering tools for the originally stored format disappear. As needed, the original content
     * may be migrated (transformed) to a more viable format in order to keep the information
     * usable with current tools while minimizing loss of information.</li>
     * <li>{@code continuation}: Record blocks from {@code continuation} records must be appended
     * to corresponding prior record blocks (eg. from other WARC files) to create the logically
     * complete full-sized original record. That is, {@code continuation} records are used when a
     * record that would otherwise cause a WARC file size to exceed a desired limit is broken into
     * segments. A continuation record shall contain the named fields
     * {@code WARC-Segment-Origin-ID} and {@code WARC-Segment-Number}, and the last
     * {@code continuation} record of a series shall contain a {@code WARC-Segment-Total-Length}
     * field. Please see the specification for details.</li>
     * <li>Other record types may be added in future, so this list is not exclusive.</li>
     * </ul>
     * @return The record's {@code WARC-Type} header field, as a string, or null if not present.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C
     * profile of ISO8601. The timestamp shall represent the instant that data capture for record
     * creation began. Multiple records written as part of a single capture event shall use the same
     * WARC-Date, even though the times of their writing will not be exactly synchronized.
     * @return The record's {@code WARC-Date} header field, as a string, or null if not present.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its period of
     * intended use. No identifier scheme is mandated by this specification, but each record-id
     * shall be a legal URI and clearly indicate a documented and registered scheme to which it
     * conforms (e.g., via a URI scheme prefix such as {@code http:} or {@code urn:}).
     * @return The record's {@code WARC-Record-ID} header field, as a string, or null if not
     *         present.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For example, in
     * HTTP request and response records, this would be {@code application/http} as per section
     * 19.1 of RFC2616 (or {@code application/http; msgtype=request} and
     * {@code application/http; msgtype=response} respectively). In particular, the content-type is
     * <em>not</em> the value of the HTTP Content-Type header in an HTTP response, but a MIME type
     * to describe the full archived HTTP message (hence {@code application/http} if the block
     * contains request or response headers).
     * @return The record's {@code Content-Type} header field, as a string, or null if not present.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record. In the
     * context of web harvesting, this is the URI that was the target of a crawler's retrieval
     * request. For a {@code revisit} record, it is the URI that was the target of a retrieval
     * request. Indirectly, such as for a {@code metadata}, or {@code conversion} record, it is a
     * copy of the {@code WARC-Target-URI} appearing in the original record to which the newer
     * record pertains. The URI in this value shall be properly escaped according to RFC3986, and
     * written with no internal whitespace.
     * @return The record's {@code WARC-Target-URI} header field, as a string, or null if not
     *         present.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     * @return The record's {@code Content-Length} header field, parsed into a non-negative int.
     * @throws IllegalStateException if the header is missing, is not a valid int, or is negative.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      int length;
      try {
        length = Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr, e);
      }
      // A negative length can never describe a body; reject it here instead of letting the
      // array allocation in the record constructor fail with NegativeArraySizeException.
      if (length < 0) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
      }
      return length;
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with that field
     * name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. The empty line which
     * terminates the header is not written by this method.
     * @param out The data output to which the header should be written.
     * @throws IOException if writing to {@code out} fails.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes("UTF-8"));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed by
     * colon-delimited key-value pairs, and {@code \r\n} line endings.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION);
      buf.append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey());
        buf.append(": ");
        buf.append(field.getValue());
        buf.append(CRLF);
      }
      return buf.toString();
    }
  }

}