public abstract class AbstractCommonCrawlFormat extends Object implements CommonCrawlFormat
| Modifier and Type | Field and Description |
|---|---|
| protected Configuration | conf |
| protected Content | content |
| protected List<String> | inLinks |
| protected boolean | jsonArray |
| protected String | keyPrefix |
| protected static org.slf4j.Logger | LOG |
| protected Metadata | metadata |
| protected boolean | reverseKey |
| protected String | reverseKeyValue |
| protected boolean | simpleDateFormat |
| protected String | url |
| Constructor and Description |
|---|
| AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) |
| Modifier and Type | Method and Description |
|---|---|
| void | close() — Optional method that could be implemented if the actual format needs some close procedure. |
| protected abstract void | closeArray(String key, boolean nested, boolean newline) |
| protected abstract void | closeObject(String key) |
| protected abstract String | generateJson() |
| protected String | getImported() |
| List<String> | getInLinks() — Gets the list of inlinks. |
| String | getJsonData() |
| String | getJsonData(String url, Content content, Metadata metadata) — Returns a string representation of the JSON structure of the URL content. |
| String | getJsonData(String url, Content content, Metadata metadata, ParseData parseData) — Returns a string representation of the JSON structure of the URL content; takes into account the parsed metadata about the URL. |
| protected String | getKey() |
| protected String | getMethod() |
| protected String | getRequestAccept() |
| protected String | getRequestAcceptEncoding() |
| protected String | getRequestAcceptLanguage() |
| protected String | getRequestContactEmail() |
| protected String | getRequestContactName() |
| protected String | getRequestHostAddress() |
| protected String | getRequestHostName() |
| protected String | getRequestRobots() |
| protected String | getRequestSoftware() |
| protected String | getRequestUserAgent() |
| protected String | getResponseAddress() |
| protected String | getResponseContent() |
| protected String | getResponseContentEncoding() |
| protected String | getResponseContentType() |
| protected String | getResponseDate() |
| protected String | getResponseHostName() |
| protected String | getResponseServer() |
| protected String | getResponseStatus() |
| protected String | getTimestamp() |
| protected String | getUrl() |
| void | setInLinks(List<String> inLinks) — Sets the inlinks of this document. |
| protected abstract void | startArray(String key, boolean nested, boolean newline) |
| protected abstract void | startObject(String key) |
| protected abstract void | writeArrayValue(String value) |
| protected abstract void | writeKeyNull(String key) |
| protected abstract void | writeKeyValue(String key, String value) |
protected static final org.slf4j.Logger LOG
protected String url
protected Content content
protected Metadata metadata
protected Configuration conf
protected String keyPrefix
protected boolean simpleDateFormat
protected boolean jsonArray
protected boolean reverseKey
protected String reverseKeyValue
public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
Throws: IOException
public String getJsonData(String url, Content content, Metadata metadata) throws IOException
Description copied from interface: CommonCrawlFormat
Specified by: getJsonData in interface CommonCrawlFormat
Throws: IOException
public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException
Description copied from interface: CommonCrawlFormat
Specified by: getJsonData in interface CommonCrawlFormat
Throws: IOException
public String getJsonData() throws IOException
Specified by: getJsonData in interface CommonCrawlFormat
Throws: IOException
protected abstract void writeKeyValue(String key, String value) throws IOException
Throws: IOException
protected abstract void writeKeyNull(String key) throws IOException
Throws: IOException
protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException
Throws: IOException
protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException
Throws: IOException
protected abstract void writeArrayValue(String value) throws IOException
Throws: IOException
protected abstract void startObject(String key) throws IOException
Throws: IOException
protected abstract void closeObject(String key) throws IOException
Throws: IOException
protected abstract String generateJson() throws IOException
Throws: IOException
protected String getUrl()
protected String getTimestamp()
protected String getMethod()
protected String getRequestHostName()
protected String getRequestHostAddress()
protected String getRequestSoftware()
protected String getRequestRobots()
protected String getRequestContactName()
protected String getRequestContactEmail()
protected String getRequestAccept()
protected String getRequestAcceptEncoding()
protected String getRequestAcceptLanguage()
protected String getRequestUserAgent()
protected String getResponseStatus()
protected String getResponseHostName()
protected String getResponseAddress()
protected String getResponseContentEncoding()
protected String getResponseContentType()
public List<String> getInLinks()
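Description copied from interface: CommonCrawlFormat
Specified by: getInLinks in interface CommonCrawlFormat
public void setInLinks(List<String> inLinks)
Description copied from interface: CommonCrawlFormat
Specified by: setInLinks in interface CommonCrawlFormat
Parameters: inLinks - list of inlinks
protected String getResponseDate()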
protected String getResponseServer()
protected String getResponseContent()
protected String getKey()
protected String getImported()
public void close()
Description copied from interface: CommonCrawlFormat
Specified by: close in interface Closeable
Specified by: close in interface AutoCloseable
Specified by: close in interface CommonCrawlFormat

Copyright © 2021 The Apache Software Foundation