public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat
| Modifier and Type | Field and Description |
|---|---|
| static String | MAX_WARC_FILE_SIZE |
| static String | TEMPLATE |
conf, content, inLinks, jsonArray, keyPrefix, LOG, metadata, reverseKey, reverseKeyValue, simpleDateFormat, url

| Constructor and Description |
|---|
| CommonCrawlFormatWARC(Configuration nutchConf, CommonCrawlConfig config) |
| CommonCrawlFormatWARC(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config, ParseData parseData) |
| Modifier and Type | Method and Description |
|---|---|
| void | close() Optional method that could be implemented if the actual format needs some close procedure. |
| protected void | closeArray(String key, boolean nested, boolean newline) |
| protected void | closeObject(String key) |
| protected String | generateJson() |
| String | getJsonData() |
| String | getJsonData(String url, Content content, Metadata metadata, ParseData parseData) Returns a string representation of the JSON structure of the URL content, taking into account the parsed metadata about the URL. |
| protected void | startArray(String key, boolean nested, boolean newline) |
| protected void | startObject(String key) |
| protected void | writeArrayValue(String value) |
| protected void | writeKeyNull(String key) |
| protected void | writeKeyValue(String key, String value) |
| protected URI | writeRequest(URI id) |
| protected URI | writeResponse() |
getImported, getInLinks, getJsonData, getKey, getMethod, getRequestAccept, getRequestAcceptEncoding, getRequestAcceptLanguage, getRequestContactEmail, getRequestContactName, getRequestHostAddress, getRequestHostName, getRequestRobots, getRequestSoftware, getRequestUserAgent, getResponseAddress, getResponseContent, getResponseContentEncoding, getResponseContentType, getResponseDate, getResponseHostName, getResponseServer, getResponseStatus, getTimestamp, getUrl, setInLinks

public static final String MAX_WARC_FILE_SIZE
public static final String TEMPLATE
public CommonCrawlFormatWARC(Configuration nutchConf, CommonCrawlConfig config) throws IOException
Throws: IOException

public CommonCrawlFormatWARC(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config, ParseData parseData) throws IOException
Throws: IOException

public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException
CommonCrawlFormat
getJsonData in interface CommonCrawlFormat
getJsonData in class AbstractCommonCrawlFormat
Throws: IOException

public String getJsonData() throws IOException
getJsonData in interface CommonCrawlFormat
getJsonData in class AbstractCommonCrawlFormat
Throws: IOException

protected URI writeResponse() throws IOException, ParseException
Throws: IOException, ParseException

protected URI writeRequest(URI id) throws IOException, ParseException
Throws: IOException, ParseException

protected String generateJson() throws IOException
generateJson in class AbstractCommonCrawlFormat
Throws: IOException

protected void writeKeyValue(String key, String value) throws IOException
writeKeyValue in class AbstractCommonCrawlFormat
Throws: IOException

protected void writeKeyNull(String key) throws IOException
writeKeyNull in class AbstractCommonCrawlFormat
Throws: IOException

protected void startArray(String key, boolean nested, boolean newline) throws IOException
startArray in class AbstractCommonCrawlFormat
Throws: IOException

protected void closeArray(String key, boolean nested, boolean newline) throws IOException
closeArray in class AbstractCommonCrawlFormat
Throws: IOException

protected void writeArrayValue(String value) throws IOException
writeArrayValue in class AbstractCommonCrawlFormat
Throws: IOException

protected void startObject(String key) throws IOException
startObject in class AbstractCommonCrawlFormat
Throws: IOException

protected void closeObject(String key) throws IOException
closeObject in class AbstractCommonCrawlFormat
Throws: IOException

public void close()
CommonCrawlFormat
close in interface Closeable
close in interface AutoCloseable
close in interface CommonCrawlFormat
close in class AbstractCommonCrawlFormat

Copyright © 2021 The Apache Software Foundation