139 lines
5.1 KiB
Java
Executable File
139 lines
5.1 KiB
Java
Executable File
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 * Copyright (c) 2004 Andrew Coleman
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
|
|
|
|
package weblech.spider;
|
|
|
|
import java.net.HttpURLConnection;
|
|
import java.net.URL;
|
|
import java.net.Authenticator;
|
|
import java.io.*;
|
|
|
|
import weblech.ui.LechLogger;
|
|
|
|
public class URLGetter
|
|
{
|
|
private int failureCount = 0;
|
|
|
|
private final SpiderConfig config;
|
|
|
|
public URLGetter(SpiderConfig config)
|
|
{
|
|
LechLogger.debug("URLGetter()");
|
|
this.config = config;
|
|
|
|
Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
|
|
}
|
|
|
|
public URLObject getURL(URLToDownload url)
|
|
{
|
|
LechLogger.debug("getURL(" + url + ")");
|
|
|
|
if(failureCount > 10)
|
|
{
|
|
LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
|
|
try { Thread.sleep(5 * 1000); } catch(InterruptedException e) { };
|
|
failureCount = 0;
|
|
}
|
|
|
|
URL requestedURL = url.getURL();
|
|
URL referer = url.getReferer();
|
|
|
|
try
|
|
{
|
|
LechLogger.debug("Creating HTTP connection to " + requestedURL);
|
|
HttpURLConnection conn = (HttpURLConnection) requestedURL.openConnection();
|
|
if(referer != null)
|
|
{
|
|
LechLogger.debug("Setting Referer header to " + referer);
|
|
conn.setRequestProperty("Referer", referer.toExternalForm());
|
|
}
|
|
|
|
if(config.getUserAgent() != null)
|
|
{
|
|
LechLogger.debug("Setting User-Agent to " + config.getUserAgent());
|
|
conn.setRequestProperty("User-Agent", config.getUserAgent());
|
|
}
|
|
|
|
conn.setUseCaches(false);
|
|
|
|
LechLogger.debug("Opening URL");
|
|
long startTime = System.currentTimeMillis();
|
|
conn.connect();
|
|
|
|
String resp = conn.getResponseMessage();
|
|
LechLogger.debug("Remote server response: " + resp);
|
|
|
|
String respStr = conn.getHeaderField(0);
|
|
LechLogger.info("Server response: " + respStr);
|
|
|
|
for(int i = 1; ; i++)
|
|
{
|
|
String key = conn.getHeaderFieldKey(i);
|
|
if(key == null)
|
|
{
|
|
break;
|
|
}
|
|
String value = conn.getHeaderField(key);
|
|
LechLogger.debug("Received header " + key + ": " + value);
|
|
}
|
|
|
|
LechLogger.debug("Getting buffered input stream from remote connection");
|
|
BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream());
|
|
ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
|
|
byte[] buf = new byte[1024];
|
|
int bytesRead = 0;
|
|
while(bytesRead >= 0)
|
|
{
|
|
baos.write(buf, 0, bytesRead);
|
|
bytesRead = remoteBIS.read(buf);
|
|
}
|
|
|
|
byte[] content = baos.toByteArray();
|
|
long timeTaken = System.currentTimeMillis() - startTime;
|
|
if(timeTaken < 100) timeTaken = 500;
|
|
|
|
int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
|
|
LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
|
|
if(content.length < conn.getContentLength())
|
|
{
|
|
LechLogger.warn("Didn't download full content for URL: " + url);
|
|
failureCount++;
|
|
return null;
|
|
}
|
|
return new URLObject(requestedURL, conn.getContentType(), content, config);
|
|
}
|
|
catch(FileNotFoundException fnfe) {
|
|
LechLogger.warn("File not found: " + fnfe.getMessage());
|
|
return null;
|
|
}
|
|
catch(IOException ioe)
|
|
{
|
|
LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
|
|
failureCount++;
|
|
return null;
|
|
}
|
|
}
|
|
}
|