This repository has been archived on 2020-05-27. You can view files and clone it, but cannot push or open issues/pull-requests.
weblech/weblech/spider/URLGetter.java

139 lines
5.1 KiB
Java
Executable File

/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.Authenticator;
import java.io.*;
import weblech.ui.LechLogger;
/**
 * Downloads the content behind a single URL over HTTP and wraps it in a
 * {@link URLObject}.
 *
 * <p>The getter keeps a running count of failed downloads. Once that count
 * exceeds {@link #MAX_FAILURES_BEFORE_PAUSE}, the next call to
 * {@link #getURL(URLToDownload)} pauses for {@link #FAILURE_PAUSE_MILLIS}
 * milliseconds before attempting the download and resets the count.
 *
 * <p>NOTE(review): {@code failureCount} is a plain, unsynchronized field; if a
 * single instance is shared between threads the count may be inaccurate —
 * confirm how callers use this class.
 */
public class URLGetter
{
    /** Number of recorded failures that must be exceeded before a download is delayed. */
    private static final int MAX_FAILURES_BEFORE_PAUSE = 10;

    /** How long to wait, in milliseconds, once too many failures have been recorded. */
    private static final long FAILURE_PAUSE_MILLIS = 5 * 1000;

    /** Initial capacity of the in-memory buffer that accumulates the response body. */
    private static final int INITIAL_CONTENT_CAPACITY = 10240;

    /** Size of the chunk buffer used when reading from the remote stream. */
    private static final int READ_CHUNK_SIZE = 1024;

    /** Failures (I/O errors and short downloads) recorded since the last pause. */
    private int failureCount = 0;

    /** Spider configuration supplying the User-Agent and basic-auth credentials. */
    private final SpiderConfig config;

    /**
     * Creates a getter and installs a process-wide default {@link Authenticator}
     * built from the basic-auth user and password held in {@code config}.
     *
     * @param config the spider configuration to read settings from
     */
    public URLGetter(SpiderConfig config)
    {
        LechLogger.debug("URLGetter()");
        this.config = config;
        // Authenticator.setDefault is global to the JVM, not specific to this instance.
        Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
    }

    /**
     * Fetches the given URL and returns its content.
     *
     * <p>The Referer header is sent when the download request carries a referer,
     * and the User-Agent header is sent when the configuration defines one.
     * Connection caching is disabled for the request.
     *
     * @param url the URL to download, together with its optional referer
     * @return the downloaded object, or {@code null} if the resource was not
     *         found, an I/O error occurred, or fewer bytes were received than
     *         the server's reported content length
     */
    public URLObject getURL(URLToDownload url)
    {
        LechLogger.debug("getURL(" + url + ")");
        pauseAfterRepeatedFailures();

        URL requestedURL = url.getURL();
        URL referer = url.getReferer();

        try
        {
            LechLogger.debug("Creating HTTP connection to " + requestedURL);
            HttpURLConnection conn = (HttpURLConnection) requestedURL.openConnection();
            applyRequestHeaders(conn, referer);
            conn.setUseCaches(false);

            LechLogger.debug("Opening URL");
            long startTime = System.currentTimeMillis();
            conn.connect();

            String resp = conn.getResponseMessage();
            LechLogger.debug("Remote server response: " + resp);

            // Header field 0 is logged as the server's response line.
            String respStr = conn.getHeaderField(0);
            LechLogger.info("Server response: " + respStr);
            logResponseHeaders(conn);

            LechLogger.debug("Getting buffered input stream from remote connection");
            byte[] content = readFully(conn.getInputStream());

            long timeTaken = System.currentTimeMillis() - startTime;
            // Durations under 100 ms are replaced by 500 ms before computing the rate
            // (kept from the original; presumably to avoid inflated figures for tiny transfers).
            if(timeTaken < 100)
            {
                timeTaken = 500;
            }
            int bytesPerSec = (int) ((double) content.length / ((double) timeTaken / 1000.0));
            LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");

            // getContentLength() is -1 when the length is unknown, so this check
            // only fires when the server announced more bytes than were received.
            if(content.length < conn.getContentLength())
            {
                LechLogger.warn("Didn't download full content for URL: " + url);
                failureCount++;
                return null;
            }
            return new URLObject(requestedURL, conn.getContentType(), content, config);
        }
        catch(FileNotFoundException fnfe)
        {
            // A missing resource is not counted towards the failure back-off.
            LechLogger.warn("File not found: " + fnfe.getMessage());
            return null;
        }
        catch(IOException ioe)
        {
            LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
            failureCount++;
            return null;
        }
    }

    /**
     * Sleeps for a fixed period if too many failures have accumulated, then
     * resets the failure count. If the sleep is interrupted the thread's
     * interrupt status is restored so callers can still observe it.
     */
    private void pauseAfterRepeatedFailures()
    {
        if(failureCount <= MAX_FAILURES_BEFORE_PAUSE)
        {
            return;
        }
        LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
        try
        {
            Thread.sleep(FAILURE_PAUSE_MILLIS);
        }
        catch(InterruptedException interrupted)
        {
            // Do not swallow the interruption: re-assert the flag and carry on.
            Thread.currentThread().interrupt();
        }
        failureCount = 0;
    }

    /**
     * Sets the optional Referer and User-Agent request headers on the connection.
     *
     * @param conn    the connection being prepared; must not be connected yet
     * @param referer the referring URL, or {@code null} to send no Referer header
     */
    private void applyRequestHeaders(HttpURLConnection conn, URL referer)
    {
        if(referer != null)
        {
            LechLogger.debug("Setting Referer header to " + referer);
            conn.setRequestProperty("Referer", referer.toExternalForm());
        }
        String userAgent = config.getUserAgent();
        if(userAgent != null)
        {
            LechLogger.debug("Setting User-Agent to " + userAgent);
            conn.setRequestProperty("User-Agent", userAgent);
        }
    }

    /**
     * Logs every response header from index 1 onwards at debug level, stopping
     * at the first index that has no key.
     *
     * @param conn the connection whose response headers are logged
     */
    private static void logResponseHeaders(HttpURLConnection conn)
    {
        for(int i = 1; ; i++)
        {
            String key = conn.getHeaderFieldKey(i);
            if(key == null)
            {
                break;
            }
            // Look the value up by index, not by key: a by-key lookup returns only
            // the last value when the same header appears more than once.
            String value = conn.getHeaderField(i);
            LechLogger.debug("Received header " + key + ": " + value);
        }
    }

    /**
     * Reads the stream to its end into memory and always closes it afterwards.
     *
     * @param in the remote stream to drain
     * @return every byte read from the stream
     * @throws IOException if reading from the stream fails
     */
    private static byte[] readFully(InputStream in) throws IOException
    {
        BufferedInputStream remoteBIS = new BufferedInputStream(in);
        try
        {
            ByteArrayOutputStream baos = new ByteArrayOutputStream(INITIAL_CONTENT_CAPACITY);
            byte[] buf = new byte[READ_CHUNK_SIZE];
            int bytesRead;
            while((bytesRead = remoteBIS.read(buf)) >= 0)
            {
                baos.write(buf, 0, bytesRead);
            }
            return baos.toByteArray();
        }
        finally
        {
            try
            {
                remoteBIS.close();
            }
            catch(IOException closeFailure)
            {
                // A failed close must not turn a completed read into a failed download.
                LechLogger.debug("Ignoring failure while closing remote stream: " + closeFailure.getMessage());
            }
        }
    }
}