Skip to content
This repository was archived by the owner on Nov 21, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions src/test/java/org/archive/io/arc/ARCReaderFactoryTest.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
package org.archive.io.arc;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.io.*;
import java.net.URL;
import java.util.Iterator;
import java.util.List;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
Expand All @@ -21,6 +20,9 @@
public class ARCReaderFactoryTest extends TestCase {

// Base sample ARC, addressed by a path relative to the working directory.
private File testfile1 = new File("src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc");
//private File testfile_nl = new File("src/test/resources/org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc");
// Hand-truncated sample ARC used by the "newlined" tests; resolved through getResource(String)
// instead of a working-directory-relative path.
private File testfile_nl = getResource(
"org/archive/format/arc/137542-153-20111129020925-00316-kb-prod-har-003.kb.dk_truncated.arc");

/**
* Test reading uncompressed arcfile for issue
Expand Down Expand Up @@ -53,5 +55,44 @@ private void offsetResourceTest( File testfile, long offset, String uri ) throws
if( raf != null )
raf.close();
}


/**
 * Walks every record of the base sample ARC with the ARCReader.
 * The test passes as long as the iteration completes without throwing.
 */
public void testBaseSampleARC() throws IOException {
    final File baseSample = testfile1;
    testIteration(baseSample);
}
/**
 * Checks the base sample with {@link ARCTestHelper}, i.e. independently of the ARCReader code:
 * the helper must find exactly 8 records in it.
 */
public void testBaseSampleIntegrity() throws IOException {
    final int extractedCount = ARCTestHelper.getURLs(testfile1).size();
    assertEquals("The correct number of URLs should be extracted", 8, extractedCount);
}

/**
 * Checks the "newlined" sample with {@link ARCTestHelper}, i.e. independently of the ARCReader
 * code: the helper must find exactly 3 records in it.
 */
public void testVerifyNewlinedSampleIntegrity() throws IOException {
    final int extractedCount = ARCTestHelper.getURLs(testfile_nl).size();
    assertEquals("The correct number of URLs should be extracted", 3, extractedCount);
}

/**
 * Walks every record of the "newlined" sample ARC with the ARCReader.
 *
 * <p>This fails, although the independent {@link ARCTestHelper} is able to process the
 * same file. Logically one of the two implementations is faulty.
 */
public void testNewlinedSampleARC() throws IOException {
    final File newlinedSample = testfile_nl;
    testIteration(newlinedSample);
}

/**
 * Opens the given ARC with {@link ARCReaderFactory} and iterates over all of its records,
 * printing the "subject-uri" header value of each one to standard out.
 *
 * <p>Nothing is asserted here; a caller passes when the iteration finishes without an exception.
 *
 * @param arc the ARC file to iterate.
 * @throws IOException if the reader cannot be opened or closed.
 */
private void testIteration(File arc) throws IOException {
    ARCReader reader = ARCReaderFactory.get(arc);
    try {
        Iterator<ArchiveRecord> ir = reader.iterator();
        while (ir.hasNext()) {
            System.out.println(ir.next().getHeader().getHeaderValue("subject-uri"));
        }
    } finally {
        // Release the underlying file even when a record fails to parse mid-iteration.
        reader.close();
    }
}

/**
 * Resolves a class path resource to a {@link File} using the context class loader of the
 * current thread.
 *
 * @param resource the resource name, relative to the class path root.
 * @return a File pointing at the resource.
 * @throws RuntimeException if the resource is not on the class path.
 */
private static File getResource(String resource) {
    URL url = Thread.currentThread().getContextClassLoader().getResource(resource);
    if (url == null) {
        throw new RuntimeException("The resource '" + resource + "' could not be located in the class path");
    }
    try {
        // URL.getFile() keeps percent-encoding ("%20" for a space), so it yields a non-existing
        // path when the project lives in a directory with spaces or non-ASCII characters.
        // Going through the URI decodes the path properly.
        return new File(url.toURI());
    } catch (java.net.URISyntaxException e) {
        // The URL is not a strictly valid URI: fall back to the raw path.
        return new File(url.getFile());
    } catch (IllegalArgumentException e) {
        // Not a hierarchical "file:" URI (e.g. a resource inside a jar): fall back to the raw path.
        return new File(url.getFile());
    }
}
}
136 changes: 136 additions & 0 deletions src/test/java/org/archive/io/arc/ARCTestHelper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.archive.io.arc;

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Lists the URLs of the records in an uncompressed ARC file.
 *
 * <p>The parsing is implemented with plain JDK classes only, so it can serve as a cross-check
 * for other ARC parsing code.
 */
public class ARCTestHelper {

    /**
     * Matches a record header line such as
     * {@code http://www.example.com/somepath 192.168.10.12 20111129020924 text/html 79022}.
     * Group 1 is the URL, i.e. the first space-delimited field; the line must also contain a
     * 14-digit timestamp field.
     */
    private static final Pattern URL_EXTRACT = Pattern.compile("^(\\S+) (?:.* )?[0-9]{14} .*");

    /**
     * Extracts the URL of every record following the ARC file header.
     *
     * <p>The file header is skipped by scanning for the line containing
     * {@code </arcmetadata>}; the first non-empty line after it is taken as the first record
     * header. Each record's content is skipped using the length stated as the last field of its
     * header line.
     *
     * @param arc an uncompressed ARC file.
     * @return the record URLs in file order; empty if no record follows the file header.
     * @throws IOException if the file does not exist or cannot be read.
     * @throws IllegalArgumentException if a record header line cannot be parsed.
     */
    public static List<String> getURLs(File arc) throws IOException {
        if (!arc.exists()) {
            throw new IOException("The file '" + arc + "' does not exist");
        }
        List<String> urls = new ArrayList<String>();
        LineInputStream in = new LineInputStream(arc);
        try {
            String line = skipArcHeader(in);
            // Iterate the records
            while (line != null) {
                Matcher matcher = URL_EXTRACT.matcher(line);
                if (!matcher.find()) {
                    throw new IllegalArgumentException("Unable to extract URL from '" + line + "'");
                }
                // Only the URL field, not the whole header line
                urls.add(matcher.group(1));
                final long delta = getDelta(line);
                if (in.skip(delta) != delta) {
                    // Best effort: truncated samples may state a length beyond the end of file
                    System.err.println("Could not skip " + delta + " bytes");
                }
                // Skip the newline after content
                if (in.read() == -1) {
                    break;
                }
                line = in.readLine();
            }
        } finally {
            // Also reached on the parse failures above, so the file handle never leaks
            in.close();
        }
        return urls;
    }

    /**
     * Consumes the ARC file header.
     *
     * @return the first non-empty line after the line containing {@code </arcmetadata>},
     *         or null if the marker or a following non-empty line is not found.
     */
    private static String skipArcHeader(LineInputStream in) throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            if (!line.contains("</arcmetadata>")) {
                continue;
            }
            while ((line = in.readLine()) != null) {
                if (!line.isEmpty()) {
                    return line;
                }
            }
            return null;
        }
        return null;
    }

    /**
     * A FileInputStream that can read newline-terminated lines and keeps count of the number of
     * bytes consumed through its read and skip methods.
     */
    public static class LineInputStream extends FileInputStream {
        private long offset = 0;

        public LineInputStream(File file) throws FileNotFoundException {
            super(file);
        }

        /**
         * Reads bytes up to the next {@code '\n'} or the end of the file and decodes them as UTF-8.
         *
         * @return the line without its terminating newline, or null if the end of the file was
         *         reached without reading any bytes.
         */
        public String readLine() throws IOException {
            ByteArrayOutputStream by = new ByteArrayOutputStream();
            int b;
            while ((b = read()) != '\n' && b != -1) {
                by.write(b);
            }
            return by.size() == 0 && b == -1 ? null : by.toString("utf-8");
        }

        /**
         * @return the number of bytes consumed so far.
         */
        public long getOffset() {
            return offset;
        }

        @Override
        public int read() throws IOException {
            int b = super.read();
            // -1 signals end of file: nothing was consumed
            if (b != -1) {
                offset++;
            }
            return b;
        }

        @Override
        public int read(byte[] b) throws IOException {
            int read = super.read(b);
            // -1 signals end of file and must not move the offset backwards
            if (read > 0) {
                offset += read;
            }
            return read;
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            int read = super.read(b, off, len);
            // -1 signals end of file and must not move the offset backwards
            if (read > 0) {
                offset += read;
            }
            return read;
        }

        @Override
        public long skip(long n) throws IOException {
            long skipped = super.skip(n);
            offset += skipped;
            return skipped;
        }
    }

    /**
     * Parses the content length from a record header line, e.g. 79022 from
     * {@code http://www.example.com/somepath 192.168.10.12 20111129020924 text/html 79022}.
     *
     * @param line a record header line.
     * @return the value of the last space-delimited field.
     * @throws IllegalArgumentException if the last field is not a number.
     */
    private static long getDelta(String line) {
        String[] tokens = line.split(" ");
        try {
            return Long.parseLong(tokens[tokens.length - 1]);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Unable to extract delta from line\n" + line, e);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
filedesc://137542-153-20111129020925-00316-kb-prod-har-003.kb.dk.arc.open 0.0.0.0 20111129020925 text/plain 1287
1 1 InternetArchive
URL IP-address Archive-date Content-type Archive-length
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd">
<arc:software>Heritrix 1.14.4 http://crawler.archive.org</arc:software>
<arc:hostname>kb-prod-har-003.kb.dk</arc:hostname>
<arc:ip>130.226.228.74</arc:ip>
<dcterms:isPartOf>default_orderxml</dcterms:isPartOf>
<dc:description>Default Profile</dc:description>
<arc:operator>Admin</arc:operator>
<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2008-01-18T11:12:17+00:00</ns0:date>
<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.12.1b +http://netarkivet.dk/website/info.html)</arc:http-header-user-agent>
<arc:http-header-from>[email protected]</arc:http-header-from>
<arc:robots>ignore</arc:robots>
<dc:format>ARC file version 1.1</dc:format>
<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo>
</arcmetadata>

http://www.deerhunter.dk////Default.aspx?ID=361&ProductComp=2634 80.63.58.81 20111129020924 text/html 548
HTTP/1.1 200 OK
Connection: close
Date: Tue, 29 Nov 2011 02:09:25 GMT
Server: Microsoft-IIS/6.0
X-Powered-By: ASP.NET
X-AspNet-Version: 2.0.50727
Cache-Control: private
Content-Type: text/html; charset=utf-8
Content-Length: 78781

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="da">
<head>
[truncated by hand]

</body>

</html>
<!-- Exe time: 0,265 <Deerhunter_TopDropdownNavigation2007.html (1749) > <PageID (361)> <Master/HTML401TransStandard2007.html> -->
http://www.def.dk/sitecore/service/notfound.aspx?item=%2farbejdsforhold%2farbejdsmiljoe%2fsitecore%2fservice%2fnotfound&user=extranet%5cAnonymous&site=website 217.145.53.21 20111129021529 text/html 703
HTTP/1.1 404 Item not found: /arbejdsforhold/arbejdsmiljoe/sitecore/service/notfound
Connection: close
Date: Tue, 29 Nov 2011 02:15:29 GMT
Server: Microsoft-IIS/6.0
X-Powered-By: ASP.NET; Sitecore CMS
X-Powered-By: ASP.NET
X-AspNet-Version: 2.0.50727
Cache-Control: no-cache, no-store
Pragma: no-cache
Expires: -1
Content-Type: text/html; charset=utf-8
Content-Length: 4802


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" >
<head>
<title>Document Not Found</title>
[truncated by hand]
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>
http://www.dccenergi.dk/privat/fyringsolie/bestil-olie/prev/privat/fyringsolie/privat/node/privat/fyringsolie/privat/privat/fyringsolie/automatisk-olielevering 195.225.91.18 20111129021529 text/html 721
HTTP/1.1 200 OK
Date: Tue, 29 Nov 2011 02:15:29 GMT
Server: Apache/2.2.3 (Red Hat) mod_ssl/2.2.3 OpenSSL/0.9.8e-fips-rhel5 DAV/2 PHP/5.2.17
X-Powered-By: PHP/5.2.17
Expires: Sun, 19 Nov 1978 05:00:00 GMT
Last-Modified: Tue, 29 Nov 2011 02:15:29 GMT
Cache-Control: store, no-cache, must-revalidate
Cache-Control: post-check=0, pre-check=0
Connection: close
Content-Type: text/html; charset=utf-8

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="da" lang="da" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
[truncated by hand]
</script>
</body>
</html>