Initial revision
commit
80c48012cd
|
@ -0,0 +1,26 @@
|
|||
WebLech license information.
|
||||
============================
|
||||
|
||||
This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
|
||||
Copyright (c) 2001 Brian Pitcher
|
||||
Copyright (c) 2004 Andrew Coleman
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
<!--
  Ant build for WebLech.
  Targets: init -> compile -> dist (default), plus clean.
-->
<project basedir="." default="dist" name="WebLech">
    <description>
        WebLech - a tool for downloading the web
    </description>

    <!-- Locations used by the targets below, resolved relative to basedir. -->
    <property name="src" location="weblech" />
    <property name="build" location="build" />
    <property name="dist" location="jars" />

    <!-- Sets the DSTAMP/TSTAMP properties and creates the class output directory. -->
    <target name="init">
        <tstamp />
        <mkdir dir="${build}" />
    </target>

    <!-- Compiles everything under ${src} into ${build}. -->
    <target depends="init" name="compile">
        <javac destdir="${build}" srcdir="${src}" />
    </target>

    <!--
      Writes a manifest naming weblech.ui.Troll as the entry point and packs
      the compiled classes into a date-stamped jar (DSTAMP comes from init).
    -->
    <target depends="compile" name="dist">
        <mkdir dir="${dist}" />
        <manifest file="${dist}/MANIFEST.MF">
            <attribute name="Main-Class" value="weblech.ui.Troll" />
        </manifest>
        <jar basedir="${build}" jarfile="${dist}/WebLech-${DSTAMP}.jar" manifest="${dist}/MANIFEST.MF" />
    </target>

    <!-- Removes both generated directories. -->
    <target name="clean">
        <delete dir="${build}" />
        <delete dir="${dist}" />
    </target>
</project>
|
|
@ -0,0 +1,18 @@
|
|||
/*
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: Michael Mason
|
||||
* Date: Jun 5, 2002
|
||||
* Time: 6:43:04 PM
|
||||
* To change template for new interface use
|
||||
* Code Style | Class Templates options (Tools | IDE Options).
|
||||
*/
|
||||
package weblech.spider;
|
||||
|
||||
/**
 * Timing constants shared by the spider classes.
 */
public interface Constants
{
    /**
     * How often to check the queue status. Spider passes this value to
     * Thread.sleep(), so it is a number of milliseconds.
     */
    public static final int QUEUE_CHECK_INTERVAL = 500;

    /**
     * How long to pause for threads to finish before exiting
     * (presumably milliseconds as well; the user of this constant is not
     * part of this file).
     */
    public static final int SPIDER_STOP_PAUSE = 500;
}
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/DownloadQueue.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.util.*;
|
||||
import java.net.URL;
|
||||
import java.io.Serializable;
|
||||
|
||||
public class DownloadQueue implements Serializable
|
||||
{
|
||||
private SpiderConfig config;
|
||||
|
||||
private List interestingURLsToDownload;
|
||||
private List averageURLsToDownload;
|
||||
private List boringURLsToDownload;
|
||||
private Set urlsInQueue;
|
||||
|
||||
public DownloadQueue(SpiderConfig config)
|
||||
{
|
||||
this.config = config;
|
||||
interestingURLsToDownload = new ArrayList();
|
||||
averageURLsToDownload = new ArrayList();
|
||||
boringURLsToDownload = new ArrayList();
|
||||
urlsInQueue = new HashSet();
|
||||
}
|
||||
|
||||
public void queueURL(URLToDownload url)
|
||||
{
|
||||
URL u = url.getURL();
|
||||
if(urlsInQueue.contains(u))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(config.isInteresting(u))
|
||||
{
|
||||
if(config.isDepthFirstSearch())
|
||||
{
|
||||
interestingURLsToDownload.add(0, url);
|
||||
}
|
||||
else
|
||||
{
|
||||
interestingURLsToDownload.add(url);
|
||||
}
|
||||
}
|
||||
else if(config.isBoring(u))
|
||||
{
|
||||
if(config.isDepthFirstSearch())
|
||||
{
|
||||
boringURLsToDownload.add(0, url);
|
||||
}
|
||||
else
|
||||
{
|
||||
boringURLsToDownload.add(url);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(config.isDepthFirstSearch())
|
||||
{
|
||||
averageURLsToDownload.add(0, url);
|
||||
}
|
||||
else
|
||||
{
|
||||
averageURLsToDownload.add(url);
|
||||
}
|
||||
}
|
||||
|
||||
urlsInQueue.add(u);
|
||||
}
|
||||
|
||||
public void queueURLs(Collection urls)
|
||||
{
|
||||
for(Iterator i = urls.iterator(); i.hasNext(); )
|
||||
{
|
||||
URLToDownload u2d = (URLToDownload) i.next();
|
||||
queueURL(u2d);
|
||||
}
|
||||
}
|
||||
|
||||
public URLToDownload getNextInQueue()
|
||||
{
|
||||
if(interestingURLsToDownload.size() > 0)
|
||||
{
|
||||
return returnURLFrom(interestingURLsToDownload);
|
||||
}
|
||||
else if(averageURLsToDownload.size() > 0)
|
||||
{
|
||||
return returnURLFrom(averageURLsToDownload);
|
||||
}
|
||||
else if(boringURLsToDownload.size() > 0)
|
||||
{
|
||||
return returnURLFrom(boringURLsToDownload);
|
||||
}
|
||||
else
|
||||
{
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private URLToDownload returnURLFrom(List urlList)
|
||||
{
|
||||
URLToDownload u2d = (URLToDownload) urlList.get(0);
|
||||
urlList.remove(0);
|
||||
urlsInQueue.remove(u2d.getURL());
|
||||
return u2d;
|
||||
}
|
||||
|
||||
public int size()
|
||||
{
|
||||
return interestingURLsToDownload.size() + averageURLsToDownload.size() + boringURLsToDownload.size();
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
return size() + " URLs";
|
||||
}
|
||||
|
||||
} // End class DownloadQueue
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.net.Authenticator;
|
||||
import java.net.PasswordAuthentication;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
public class DumbAuthenticator extends Authenticator
|
||||
{
|
||||
private final String user;
|
||||
private final String password;
|
||||
|
||||
public DumbAuthenticator(String user, String password)
|
||||
{
|
||||
LechLogger.debug("DumbAuthenticator(" + user + ", ***)");
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
public PasswordAuthentication getPasswordAuthentication()
|
||||
{
|
||||
LechLogger.debug("getPasswordAuthentication()");
|
||||
return new PasswordAuthentication(user, password.toCharArray());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.net.URL;
|
||||
import java.net.MalformedURLException;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.FileWriter;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
public class HTMLParser
|
||||
{
|
||||
private SpiderConfig config;
|
||||
|
||||
public HTMLParser(SpiderConfig config)
|
||||
{
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public List parseLinksInDocument(URL sourceURL, String textContent)
|
||||
{
|
||||
return parseAsHTML(sourceURL, textContent);
|
||||
}
|
||||
|
||||
private List parseAsHTML(URL sourceURL, String textContent)
|
||||
{
|
||||
LechLogger.debug("parseAsHTML()");
|
||||
ArrayList newURLs = new ArrayList();
|
||||
HashSet newURLSet = new HashSet();
|
||||
|
||||
/* note from coleman:
|
||||
* I had to add a few tags into this, namely the link and embeds. weblech should download flash
|
||||
* movies, mpegs, avis, and anything else that it finds on the page. even stylesheets :)
|
||||
*/
|
||||
extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("link", "href", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("embed", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("LINK", "HREF", sourceURL, newURLs, newURLSet, textContent);
|
||||
extractAttributesFromTags("EMBED", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||
|
||||
if(newURLs.size() == 0)
|
||||
{
|
||||
LechLogger.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);
|
||||
}
|
||||
LechLogger.debug("Returning " + newURLs.size() + " urls extracted from page");
|
||||
return newURLs;
|
||||
}
|
||||
|
||||
private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)
|
||||
{
|
||||
LechLogger.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");
|
||||
|
||||
int startPos = 0;
|
||||
String startTag = "<" + tag + " ";
|
||||
String attrStr = attr + "=\"";
|
||||
while(true)
|
||||
{
|
||||
int tagPos = input.indexOf(startTag, startPos);
|
||||
if(tagPos < 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
int attrPos = input.indexOf(attrStr, tagPos + 1);
|
||||
if(attrPos < 0)
|
||||
{
|
||||
startPos = tagPos + 1;
|
||||
continue;
|
||||
}
|
||||
int nextClosePos = input.indexOf(">", tagPos + 1);
|
||||
if(attrPos < nextClosePos)
|
||||
{
|
||||
// Ooh, found one
|
||||
int closeQuotePos = input.indexOf("\"", attrPos + attrStr.length() + 1);
|
||||
if(closeQuotePos > 0)
|
||||
{
|
||||
String urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos);
|
||||
if(urlStr.indexOf('#') != -1)
|
||||
{
|
||||
urlStr = urlStr.substring(0, urlStr.indexOf('#'));
|
||||
}
|
||||
//LechLogger.debug("Found possible URL string: " + URL);
|
||||
|
||||
if(isMailTo(urlStr))
|
||||
{
|
||||
logMailURL(urlStr);
|
||||
}
|
||||
else
|
||||
{
|
||||
try
|
||||
{
|
||||
|
||||
URL u = new URL(sourceURL, urlStr);
|
||||
if(newURLSet.contains(u))
|
||||
{
|
||||
//LechLogger.debug("Already found URL on page: " + u);
|
||||
}
|
||||
else
|
||||
{
|
||||
newURLs.add(u);
|
||||
newURLSet.add(u);
|
||||
//LechLogger.debug("Found new URL on page: " + u);
|
||||
}
|
||||
}
|
||||
catch(MalformedURLException murle)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
startPos = tagPos + 1;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
startPos = tagPos + 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void logMailURL(String url)
|
||||
{
|
||||
LechLogger.debug("logMailURL()");
|
||||
|
||||
try
|
||||
{
|
||||
FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true);
|
||||
PrintWriter pW = new PrintWriter(appendedFile);
|
||||
pW.println(url);
|
||||
pW.flush();
|
||||
pW.close();
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
LechLogger.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a particular URL looks like it's a mailto: style link.
|
||||
*/
|
||||
private boolean isMailTo(String url)
|
||||
{
|
||||
if(url == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
url = url.toUpperCase();
|
||||
return (url.indexOf("MAILTO:") != -1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
|
||||
public class Spider implements Runnable, Constants
|
||||
{
|
||||
/** Config for the spider */
|
||||
private SpiderConfig config;
|
||||
/**
|
||||
* Download queue.
|
||||
* Thread safety: To access the queue, first synchronize on it.
|
||||
*/
|
||||
private DownloadQueue queue;
|
||||
/**
|
||||
* Set of URLs downloaded or scheduled, so we don't download a
|
||||
* URL more than once.
|
||||
* Thread safety: To access the set, first synchronize on it.
|
||||
*/
|
||||
private Set urlsDownloadedOrScheduled;
|
||||
/**
|
||||
* Set of URLs currently being downloaded by Spider threads.
|
||||
* Thread safety: To access the set, first synchronize on it.
|
||||
*/
|
||||
private Set urlsDownloading;
|
||||
/**
|
||||
* Number of downloads currently taking place.
|
||||
* Thread safety: To modify this value, first synchronize on
|
||||
* the download queue.
|
||||
*/
|
||||
private int downloadsInProgress;
|
||||
/** Whether the spider should quit */
|
||||
private boolean quit;
|
||||
/** Count of running Spider threads. */
|
||||
private int running;
|
||||
/** Time we last checkpointed. */
|
||||
private long lastCheckpoint;
|
||||
|
||||
public Spider(SpiderConfig config)
|
||||
{
|
||||
this.config = config;
|
||||
queue = new DownloadQueue(config);
|
||||
queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
|
||||
urlsDownloadedOrScheduled = new HashSet();
|
||||
urlsDownloading = new HashSet();
|
||||
downloadsInProgress = 0;
|
||||
lastCheckpoint = 0;
|
||||
}
|
||||
|
||||
public void start()
|
||||
{
|
||||
quit = false;
|
||||
running = 0;
|
||||
|
||||
for(int i = 0; i < config.getSpiderThreads(); i++)
|
||||
{
|
||||
LechLogger.info("Starting Spider thread");
|
||||
Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
|
||||
t.start();
|
||||
running++;
|
||||
}
|
||||
}
|
||||
|
||||
public void stop()
|
||||
{
|
||||
quit = true;
|
||||
}
|
||||
|
||||
public boolean isRunning()
|
||||
{
|
||||
return running == 0;
|
||||
}
|
||||
|
||||
private void checkpointIfNeeded()
|
||||
{
|
||||
if(config.getCheckpointInterval() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
|
||||
{
|
||||
synchronized(queue)
|
||||
{
|
||||
if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
|
||||
{
|
||||
writeCheckpoint();
|
||||
lastCheckpoint = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeCheckpoint()
|
||||
{
|
||||
LechLogger.debug("writeCheckpoint()");
|
||||
try
|
||||
{
|
||||
FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
|
||||
ObjectOutputStream oos = new ObjectOutputStream(fos);
|
||||
oos.writeObject(queue);
|
||||
oos.writeObject(urlsDownloading);
|
||||
oos.close();
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
LechLogger.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
public void readCheckpoint()
|
||||
{
|
||||
try
|
||||
{
|
||||
FileInputStream fis = new FileInputStream("spider.checkpoint");
|
||||
ObjectInputStream ois = new ObjectInputStream(fis);
|
||||
queue = (DownloadQueue) ois.readObject();
|
||||
urlsDownloading = (Set) ois.readObject();
|
||||
queue.queueURLs(urlsDownloading);
|
||||
urlsDownloading.clear();
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
LechLogger.error("Caught exception reading checkpoint: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public void run()
|
||||
{
|
||||
HTMLParser htmlParser = new HTMLParser(config);
|
||||
URLGetter urlGetter = new URLGetter(config);
|
||||
|
||||
while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)
|
||||
{
|
||||
checkpointIfNeeded();
|
||||
if(queueSize() == 0 && downloadsInProgress > 0)
|
||||
{
|
||||
// Wait for a download to finish before seeing if this thread should stop
|
||||
try
|
||||
{
|
||||
Thread.sleep(QUEUE_CHECK_INTERVAL);
|
||||
}
|
||||
catch(InterruptedException ignored)
|
||||
{
|
||||
}
|
||||
// Have another go at the loop
|
||||
continue;
|
||||
}
|
||||
else if(queueSize() == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
URLToDownload nextURL;
|
||||
synchronized(queue)
|
||||
{
|
||||
nextURL = queue.getNextInQueue();
|
||||
downloadsInProgress++;
|
||||
}
|
||||
synchronized(urlsDownloading)
|
||||
{
|
||||
urlsDownloading.add(nextURL);
|
||||
}
|
||||
int newDepth = nextURL.getDepth() + 1;
|
||||
int maxDepth = config.getMaxDepth();
|
||||
synchronized(urlsDownloading)
|
||||
{
|
||||
urlsDownloading.remove(nextURL);
|
||||
}
|
||||
List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
|
||||
|
||||
newURLs = filterURLs(newURLs);
|
||||
|
||||
ArrayList u2dsToQueue = new ArrayList();
|
||||
for(Iterator i = newURLs.iterator(); i.hasNext(); )
|
||||
{
|
||||
URL u = (URL) i.next();
|
||||
// Download if not yet downloaded, and the new depth is less than the maximum
|
||||
synchronized(urlsDownloadedOrScheduled)
|
||||
{
|
||||
if(!urlsDownloadedOrScheduled.contains(u)
|
||||
&& (maxDepth == 0 || newDepth <= maxDepth))
|
||||
{
|
||||
u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
|
||||
urlsDownloadedOrScheduled.add(u);
|
||||
}
|
||||
}
|
||||
}
|
||||
synchronized(queue)
|
||||
{
|
||||
queue.queueURLs(u2dsToQueue);
|
||||
downloadsInProgress--;
|
||||
}
|
||||
}
|
||||
LechLogger.info("Spider thread stopping [" + config.getStartLocation() + "]" );
|
||||
running--;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the size of the download queue in a thread-safe manner.
|
||||
*/
|
||||
private int queueSize()
|
||||
{
|
||||
synchronized(queue)
|
||||
{
|
||||
return queue.size();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a URL, and return new URLs that are referenced from it.
|
||||
*
|
||||
* @return A List of URL objects.
|
||||
*/
|
||||
private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
|
||||
{
|
||||
LechLogger.debug("downloadURL(" + url + ")");
|
||||
|
||||
// Bail out early if image and already on disk
|
||||
URLObject obj = new URLObject(url.getURL(), config);
|
||||
if(obj.existsOnDisk())
|
||||
{
|
||||
if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
|
||||
{
|
||||
LechLogger.info("Q: [" + queue + "] " + url);
|
||||
obj = urlGetter.getURL(url);
|
||||
}
|
||||
else if(config.refreshImages() && obj.isImage())
|
||||
{
|
||||
LechLogger.info("Q: [" + queue + "] " + url);
|
||||
obj = urlGetter.getURL(url);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LechLogger.info("Q: [" + queue + "] " + url);
|
||||
obj = urlGetter.getURL(url);
|
||||
}
|
||||
|
||||
if(obj == null)
|
||||
{
|
||||
return new ArrayList();
|
||||
}
|
||||
|
||||
if(!obj.existsOnDisk())
|
||||
{
|
||||
obj.writeToFile();
|
||||
}
|
||||
|
||||
if(obj.isHTML() || obj.isXML())
|
||||
{
|
||||
return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
|
||||
}
|
||||
else if(obj.isImage())
|
||||
{
|
||||
return new ArrayList();
|
||||
}
|
||||
else
|
||||
{
|
||||
LechLogger.warn("Unknown content type received: " + obj.getContentType());
|
||||
LechLogger.info("URL was " + url);
|
||||
return new ArrayList();
|
||||
}
|
||||
}
|
||||
|
||||
private List filterURLs(List URLs)
|
||||
{
|
||||
String match = config.getURLMatch();
|
||||
ArrayList retVal = new ArrayList();
|
||||
|
||||
synchronized(urlsDownloadedOrScheduled)
|
||||
{
|
||||
for(Iterator i = URLs.iterator(); i.hasNext(); )
|
||||
{
|
||||
URL u = (URL) i.next();
|
||||
if(urlsDownloadedOrScheduled.contains(u))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
String s = u.toString();
|
||||
if(s.indexOf(match) != -1)
|
||||
{
|
||||
retVal.add(u);
|
||||
}
|
||||
}
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
/* Method By Coleman
|
||||
* A basic check to see if there is another spider downloading the same thing
|
||||
*/
|
||||
protected boolean compareSpiderConfig ( SpiderConfig sc ) {
|
||||
return config.getStartLocation().equals ( sc.getStartLocation() );
|
||||
}
|
||||
|
||||
/* Method By Coleman
|
||||
* A method to determine if one spider is downloading the same file as another spider
|
||||
*/
|
||||
public boolean equals ( Object o ) {
|
||||
if ( !o.getClass().isInstance ( this ) ) return false;
|
||||
return ((Spider) o).compareSpiderConfig ( config );
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,464 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.net.URL;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
public class SpiderConfig implements Serializable
|
||||
{
|
||||
private File saveRootDirectory;
|
||||
private File mailtoLogFile;
|
||||
|
||||
private boolean refreshHTMLs;
|
||||
private boolean refreshImages;
|
||||
private boolean refreshOthers;
|
||||
|
||||
private Set htmlExtensions;
|
||||
private Set imageExtensions;
|
||||
|
||||
private URL startLocation;
|
||||
private String urlMatch;
|
||||
|
||||
private List interestingURLSubstrings;
|
||||
private List boringURLSubstrings;
|
||||
|
||||
private boolean depthFirst;
|
||||
private int maxDepth;
|
||||
|
||||
private String userAgent;
|
||||
|
||||
private String basicAuthUser;
|
||||
private String basicAuthPassword;
|
||||
|
||||
private int spiderThreads;
|
||||
|
||||
private long checkpointInterval;
|
||||
|
||||
/**
|
||||
* Create a default config.
|
||||
*/
|
||||
public SpiderConfig()
|
||||
{
|
||||
LechLogger.debug("SpiderConfig()");
|
||||
|
||||
saveRootDirectory = new File(".");
|
||||
mailtoLogFile = new File("mailto.txt");
|
||||
|
||||
refreshHTMLs = true;
|
||||
refreshImages = false;
|
||||
refreshOthers = false;
|
||||
|
||||
htmlExtensions = new HashSet();
|
||||
htmlExtensions.add("htm");
|
||||
htmlExtensions.add("html");
|
||||
htmlExtensions.add("shtml");
|
||||
|
||||
imageExtensions = new HashSet();
|
||||
imageExtensions.add("jpg");
|
||||
imageExtensions.add("gif");
|
||||
imageExtensions.add("png");
|
||||
/* Added a few image extensions -- Coleman */
|
||||
imageExtensions.add("tiff");
|
||||
imageExtensions.add("bmp");
|
||||
|
||||
urlMatch = null;
|
||||
interestingURLSubstrings = new ArrayList();
|
||||
boringURLSubstrings = new ArrayList();
|
||||
depthFirst = false;
|
||||
maxDepth = 0;
|
||||
|
||||
userAgent = "WebLech Spider [Release C]";
|
||||
basicAuthUser = "";
|
||||
basicAuthPassword = "";
|
||||
|
||||
spiderThreads = 1;
|
||||
|
||||
checkpointInterval = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a config from a java.util.Properties object.
|
||||
*/
|
||||
public SpiderConfig(Properties props)
|
||||
{
|
||||
LechLogger.debug("SpiderConfig(props)");
|
||||
|
||||
saveRootDirectory = new File(props.getProperty("saveRootDirectory", "."));
|
||||
if(!saveRootDirectory.exists())
|
||||
{
|
||||
if(!saveRootDirectory.mkdirs())
|
||||
{
|
||||
LechLogger.error("Couldn't create root directory: " + saveRootDirectory);
|
||||
LechLogger.info("Defaulting to . instead");
|
||||
saveRootDirectory = new File(".");
|
||||
}
|
||||
}
|
||||
else if(!saveRootDirectory.isDirectory())
|
||||
{
|
||||
LechLogger.error("Save root is not a directory: " + saveRootDirectory);
|
||||
LechLogger.info("Defaulting to . instead");
|
||||
saveRootDirectory = new File(".");
|
||||
}
|
||||
|
||||
String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
|
||||
// Check if absolute or relative name given
|
||||
if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
|
||||
{
|
||||
LechLogger.debug("Using absolute file name " + mailtoFileStr);
|
||||
mailtoLogFile = new File(mailtoFileStr);
|
||||
}
|
||||
else
|
||||
{
|
||||
LechLogger.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
|
||||
mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
|
||||
}
|
||||
|
||||
refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue();
|
||||
refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue();
|
||||
refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue();
|
||||
|
||||
htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
|
||||
imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
|
||||
|
||||
String startLocStr = props.getProperty("startLocation");
|
||||
if(startLocStr != null)
|
||||
{
|
||||
try
|
||||
{
|
||||
startLocation = new URL(startLocStr);
|
||||
}
|
||||
catch(MalformedURLException murle)
|
||||
{
|
||||
LechLogger.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LechLogger.warn("startLocation not found in properties");
|
||||
}
|
||||
|
||||
urlMatch = props.getProperty("urlMatch");
|
||||
|
||||
interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
|
||||
boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
|
||||
|
||||
depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue();
|
||||
try
|
||||
{
|
||||
String maxDepthStr = props.getProperty("maxDepth", "0");
|
||||
maxDepth = Integer.parseInt(maxDepthStr);
|
||||
}
|
||||
catch(NumberFormatException nfe)
|
||||
{
|
||||
LechLogger.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
|
||||
maxDepth = 1;
|
||||
}
|
||||
|
||||
userAgent = props.getProperty("userAgent", "WebLech Spider [Version C]");
|
||||
basicAuthUser = props.getProperty("basicAuthUser", "");
|
||||
basicAuthPassword = props.getProperty("basicAuthPassword", "");
|
||||
|
||||
try
|
||||
{
|
||||
String threadsStr = props.getProperty("spiderThreads", "1");
|
||||
spiderThreads = Integer.parseInt(threadsStr);
|
||||
}
|
||||
catch(NumberFormatException nfe)
|
||||
{
|
||||
LechLogger.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
|
||||
spiderThreads = 1;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
String intervalStr = props.getProperty("checkpointInterval", "0");
|
||||
checkpointInterval = Long.parseLong(intervalStr);
|
||||
}
|
||||
catch(NumberFormatException nfe)
|
||||
{
|
||||
LechLogger.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
|
||||
spiderThreads = 1;
|
||||
}
|
||||
}
|
||||
|
||||
private List parsePropCommaSeparated(String str)
|
||||
{
|
||||
ArrayList result = new ArrayList();
|
||||
if(str != null && str.length() > 0)
|
||||
{
|
||||
StringTokenizer tok = new StringTokenizer(str, ",");
|
||||
while(tok.hasMoreTokens())
|
||||
{
|
||||
result.add(tok.nextToken());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public void setRefreshHTMLs(boolean refreshHTMLs)
|
||||
{
|
||||
this.refreshHTMLs = refreshHTMLs;
|
||||
}
|
||||
|
||||
public boolean refreshHTMLs()
|
||||
{
|
||||
return refreshHTMLs;
|
||||
}
|
||||
|
||||
public void setRefreshImages(boolean refreshImages)
|
||||
{
|
||||
this.refreshImages = refreshImages;
|
||||
}
|
||||
|
||||
public boolean refreshImages()
|
||||
{
|
||||
return refreshImages;
|
||||
}
|
||||
|
||||
public void setRefreshOthers(boolean refreshOthers)
|
||||
{
|
||||
this.refreshOthers = refreshOthers;
|
||||
}
|
||||
|
||||
public boolean refreshOthers()
|
||||
{
|
||||
return refreshOthers;
|
||||
}
|
||||
|
||||
public void setSaveRootDirectory(File saveRootDirectory)
|
||||
{
|
||||
this.saveRootDirectory = saveRootDirectory;
|
||||
}
|
||||
|
||||
public File getSaveRootDirectory()
|
||||
{
|
||||
return saveRootDirectory;
|
||||
}
|
||||
|
||||
public void setMailtoLogFile(File mailtoLogFile)
|
||||
{
|
||||
this.mailtoLogFile = mailtoLogFile;
|
||||
}
|
||||
|
||||
public File getMailtoLogFile()
|
||||
{
|
||||
return mailtoLogFile;
|
||||
}
|
||||
|
||||
public void setStartLocation(URL startLocation)
|
||||
{
|
||||
this.startLocation = startLocation;
|
||||
}
|
||||
|
||||
public URL getStartLocation()
|
||||
{
|
||||
return startLocation;
|
||||
}
|
||||
|
||||
public void setURLMatch(String urlMatch)
|
||||
{
|
||||
this.urlMatch = urlMatch;
|
||||
}
|
||||
|
||||
public String getURLMatch()
|
||||
{
|
||||
return urlMatch;
|
||||
}
|
||||
|
||||
public List getInterestingURLSubstrings()
|
||||
{
|
||||
return interestingURLSubstrings;
|
||||
}
|
||||
|
||||
public void setInterestingURLSubstrings(List interestingURLSubstrings)
|
||||
{
|
||||
this.interestingURLSubstrings = interestingURLSubstrings;
|
||||
}
|
||||
|
||||
public List getBoringURLSubstrings()
|
||||
{
|
||||
return boringURLSubstrings;
|
||||
}
|
||||
|
||||
public void setBoringURLSubstrings(List boringURLSubstrings)
|
||||
{
|
||||
this.boringURLSubstrings = boringURLSubstrings;
|
||||
}
|
||||
|
||||
public boolean isInteresting(URL u)
|
||||
{
|
||||
return matchURL(u, interestingURLSubstrings);
|
||||
}
|
||||
|
||||
public boolean isBoring(URL u)
|
||||
{
|
||||
return matchURL(u, boringURLSubstrings);
|
||||
}
|
||||
|
||||
private boolean matchURL(URL u, List substrings)
|
||||
{
|
||||
String str = u.toExternalForm();
|
||||
for(Iterator i = substrings.iterator(); i.hasNext(); )
|
||||
{
|
||||
String substr = (String) i.next();
|
||||
if(str.indexOf(substr) != -1)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void setDepthFirstSearch(boolean depthFirst)
|
||||
{
|
||||
this.depthFirst = depthFirst;
|
||||
}
|
||||
|
||||
public boolean isDepthFirstSearch()
|
||||
{
|
||||
return depthFirst;
|
||||
}
|
||||
|
||||
public void setMaxDepth(int maxDepth)
|
||||
{
|
||||
this.maxDepth = maxDepth;
|
||||
}
|
||||
|
||||
public int getMaxDepth()
|
||||
{
|
||||
return maxDepth;
|
||||
}
|
||||
|
||||
public void setUserAgent(String userAgent)
|
||||
{
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
public String getUserAgent()
|
||||
{
|
||||
return userAgent;
|
||||
}
|
||||
|
||||
public void setBasicAuthUser(String basicAuthUser)
|
||||
{
|
||||
this.basicAuthUser = basicAuthUser;
|
||||
}
|
||||
|
||||
public String getBasicAuthUser()
|
||||
{
|
||||
return basicAuthUser;
|
||||
}
|
||||
|
||||
public void setBasicAuthPassword(String basicAuthPassword)
|
||||
{
|
||||
this.basicAuthPassword = basicAuthPassword;
|
||||
}
|
||||
|
||||
public String getBasicAuthPassword()
|
||||
{
|
||||
return basicAuthPassword;
|
||||
}
|
||||
|
||||
public void setSpiderThreads(int spiderThreads)
|
||||
{
|
||||
this.spiderThreads = spiderThreads;
|
||||
}
|
||||
|
||||
public int getSpiderThreads()
|
||||
{
|
||||
return spiderThreads;
|
||||
}
|
||||
|
||||
public void setCheckpointInterval(long interval)
|
||||
{
|
||||
this.checkpointInterval = interval;
|
||||
}
|
||||
|
||||
public long getCheckpointInterval()
|
||||
{
|
||||
return checkpointInterval;
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
return "depthFirst:\t" + depthFirst
|
||||
+ "\nmaxDepth:\t" + maxDepth
|
||||
+ "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
|
||||
+ "\nimageExtensions:\t" + fromSet(imageExtensions)
|
||||
+ "\nrefreshHTMLs:\t" + refreshHTMLs
|
||||
+ "\nrefreshImages:\t" + refreshImages
|
||||
+ "\nrefreshOthers:\t" + refreshOthers
|
||||
+ "\nsaveRootDirectory:\t" + saveRootDirectory
|
||||
+ "\nstartLocation:\t" + startLocation
|
||||
+ "\nurlMatch:\t" + urlMatch
|
||||
+ "\nuserAgent:\t" + userAgent
|
||||
+ "\nbasicAuthUser:\t" + basicAuthUser
|
||||
+ "\nbasicAuthPassword:\t" + "***"
|
||||
+ "\nspiderThreads:\t" + spiderThreads
|
||||
+ "\ncheckpointInterval:\t" + checkpointInterval;
|
||||
}
|
||||
|
||||
private Set parseSet(String str)
|
||||
{
|
||||
LechLogger.debug("parseSet(" + str + ")");
|
||||
HashSet result = new HashSet();
|
||||
StringTokenizer sTok = new StringTokenizer(str, ",");
|
||||
while(sTok.hasMoreTokens())
|
||||
{
|
||||
String tok = sTok.nextToken().trim();
|
||||
result.add(tok);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private String fromSet(Set s)
|
||||
{
|
||||
StringBuffer sb = new StringBuffer();
|
||||
boolean first = true;
|
||||
for(Iterator i = s.iterator(); i.hasNext(); )
|
||||
{
|
||||
String str = (String) i.next();
|
||||
if(first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
sb.append(",");
|
||||
}
|
||||
sb.append(str);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
} // End class SpiderConfig
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.net.Authenticator;
|
||||
import java.io.*;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
public class URLGetter
|
||||
{
|
||||
private int failureCount = 0;
|
||||
|
||||
private final SpiderConfig config;
|
||||
|
||||
public URLGetter(SpiderConfig config)
|
||||
{
|
||||
LechLogger.debug("URLGetter()");
|
||||
this.config = config;
|
||||
|
||||
Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
|
||||
}
|
||||
|
||||
public URLObject getURL(URLToDownload url)
|
||||
{
|
||||
LechLogger.debug("getURL(" + url + ")");
|
||||
|
||||
if(failureCount > 10)
|
||||
{
|
||||
LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
|
||||
try { Thread.sleep(5 * 1000); } catch(InterruptedException e) { };
|
||||
failureCount = 0;
|
||||
}
|
||||
|
||||
URL requestedURL = url.getURL();
|
||||
URL referer = url.getReferer();
|
||||
|
||||
try
|
||||
{
|
||||
LechLogger.debug("Creating HTTP connection to " + requestedURL);
|
||||
HttpURLConnection conn = (HttpURLConnection) requestedURL.openConnection();
|
||||
if(referer != null)
|
||||
{
|
||||
LechLogger.debug("Setting Referer header to " + referer);
|
||||
conn.setRequestProperty("Referer", referer.toExternalForm());
|
||||
}
|
||||
|
||||
if(config.getUserAgent() != null)
|
||||
{
|
||||
LechLogger.debug("Setting User-Agent to " + config.getUserAgent());
|
||||
conn.setRequestProperty("User-Agent", config.getUserAgent());
|
||||
}
|
||||
|
||||
conn.setUseCaches(false);
|
||||
|
||||
LechLogger.debug("Opening URL");
|
||||
long startTime = System.currentTimeMillis();
|
||||
conn.connect();
|
||||
|
||||
String resp = conn.getResponseMessage();
|
||||
LechLogger.debug("Remote server response: " + resp);
|
||||
|
||||
String respStr = conn.getHeaderField(0);
|
||||
LechLogger.info("Server response: " + respStr);
|
||||
|
||||
for(int i = 1; ; i++)
|
||||
{
|
||||
String key = conn.getHeaderFieldKey(i);
|
||||
if(key == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
String value = conn.getHeaderField(key);
|
||||
LechLogger.debug("Received header " + key + ": " + value);
|
||||
}
|
||||
|
||||
LechLogger.debug("Getting buffered input stream from remote connection");
|
||||
BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream());
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
|
||||
byte[] buf = new byte[1024];
|
||||
int bytesRead = 0;
|
||||
while(bytesRead >= 0)
|
||||
{
|
||||
baos.write(buf, 0, bytesRead);
|
||||
bytesRead = remoteBIS.read(buf);
|
||||
}
|
||||
|
||||
byte[] content = baos.toByteArray();
|
||||
long timeTaken = System.currentTimeMillis() - startTime;
|
||||
if(timeTaken < 100) timeTaken = 500;
|
||||
|
||||
int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
|
||||
LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
|
||||
if(content.length < conn.getContentLength())
|
||||
{
|
||||
LechLogger.warn("Didn't download full content for URL: " + url);
|
||||
failureCount++;
|
||||
return null;
|
||||
}
|
||||
return new URLObject(requestedURL, conn.getContentType(), content, config);
|
||||
}
|
||||
catch(FileNotFoundException fnfe) {
|
||||
LechLogger.warn("File not found: " + fnfe.getMessage());
|
||||
return null;
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
|
||||
failureCount++;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,206 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
* Copyright (c) 2004 Andrew Coleman
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
|
||||
import weblech.ui.LechLogger;
|
||||
|
||||
public class URLObject
|
||||
{
|
||||
private final URL sourceURL;
|
||||
private final String contentType;
|
||||
private final byte[] content;
|
||||
|
||||
private final SpiderConfig config;
|
||||
|
||||
public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config)
|
||||
{
|
||||
this.sourceURL = sourceURL;
|
||||
this.contentType = contentType;
|
||||
this.content = content;
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public URLObject(URL sourceURL, SpiderConfig config)
|
||||
{
|
||||
this.sourceURL = sourceURL;
|
||||
this.config = config;
|
||||
|
||||
String s = sourceURL.toExternalForm().toLowerCase();
|
||||
if(s.indexOf(".jpg") != -1)
|
||||
{
|
||||
contentType = "image/jpeg";
|
||||
}
|
||||
else if(s.indexOf(".gif") != -1)
|
||||
{
|
||||
contentType = "image/gif";
|
||||
}
|
||||
else
|
||||
{
|
||||
contentType = "text/html";
|
||||
}
|
||||
|
||||
if(existsOnDisk())
|
||||
{
|
||||
|
||||
File f = new File(convertToFileName());
|
||||
if(f.isDirectory())
|
||||
{
|
||||
f = new File(f, "index.html");
|
||||
}
|
||||
content = new byte[(int) f.length()];
|
||||
try
|
||||
{
|
||||
FileInputStream in = new FileInputStream(f);
|
||||
in.read(content);
|
||||
in.close();
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
LechLogger.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
content = new byte[0];
|
||||
}
|
||||
}
|
||||
|
||||
public String getContentType()
|
||||
{
|
||||
return contentType;
|
||||
}
|
||||
|
||||
public boolean isHTML()
|
||||
{
|
||||
return contentType.toLowerCase().startsWith("text/html");
|
||||
}
|
||||
|
||||
public boolean isXML()
|
||||
{
|
||||
return contentType.toLowerCase().startsWith("text/xml");
|
||||
}
|
||||
|
||||
public boolean isImage()
|
||||
{
|
||||
return contentType.startsWith("image/");
|
||||
}
|
||||
|
||||
public String getStringContent()
|
||||
{
|
||||
return new String(content);
|
||||
}
|
||||
|
||||
private String convertToFileName()
|
||||
{
|
||||
String url = sourceURL.toExternalForm();
|
||||
int httpIdx = url.indexOf("http://");
|
||||
if(httpIdx == 0)
|
||||
{
|
||||
url = url.substring(7);
|
||||
}
|
||||
// Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
|
||||
if(url.indexOf("/") < 0)
|
||||
{
|
||||
url = url + "/";
|
||||
}
|
||||
// If trailing slash, add index.html as default
|
||||
if(url.endsWith("/"))
|
||||
{
|
||||
url = url + "index.html";
|
||||
}
|
||||
try {
|
||||
/* the old encode method is now deprecated, updated to the new API -- Coleman */
|
||||
url = textReplace("?", URLEncoder.encode("?","UTF-8"), url);
|
||||
url = textReplace("&", URLEncoder.encode("&","UTF-8"), url);
|
||||
}
|
||||
catch ( java.io.UnsupportedEncodingException exception ) {
|
||||
LechLogger.error ( exception.toString() );
|
||||
}
|
||||
return config.getSaveRootDirectory().getPath() + "/" + url;
|
||||
}
|
||||
|
||||
public boolean existsOnDisk()
|
||||
{
|
||||
File f = new File(convertToFileName());
|
||||
return (f.exists() && !f.isDirectory());
|
||||
}
|
||||
|
||||
public void writeToFile()
|
||||
{
|
||||
writeToFile(convertToFileName());
|
||||
}
|
||||
|
||||
public void writeToFile(String fileName)
|
||||
{
|
||||
LechLogger.debug("writeToFile(" + fileName + ")");
|
||||
try
|
||||
{
|
||||
File f = new File(fileName);
|
||||
f.getParentFile().mkdirs();
|
||||
FileOutputStream out = new FileOutputStream(fileName);
|
||||
out.write(content);
|
||||
out.flush();
|
||||
out.close();
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
LechLogger.warn("IO Exception writing to " + fileName, ioe);
|
||||
}
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append("URLObject: ");
|
||||
sb.append(contentType);
|
||||
if(false)//isHTML() || isXML())
|
||||
{
|
||||
sb.append("\n");
|
||||
sb.append(getStringContent());
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private String textReplace(String find, String replace, String input)
|
||||
{
|
||||
int startPos = 0;
|
||||
while(true)
|
||||
{
|
||||
int textPos = input.indexOf(find, startPos);
|
||||
if(textPos < 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
input = input.substring(0, textPos) + replace + input.substring(textPos + find.length());
|
||||
startPos = textPos + replace.length();
|
||||
}
|
||||
return input;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||
*
|
||||
* Copyright (c) 2001 Brian Pitcher
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/URLToDownload.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
|
||||
|
||||
package weblech.spider;
|
||||
|
||||
import java.net.URL;
|
||||
|
||||
/**
 * Immutable, serializable record of a URL queued for download: the URL,
 * the page that referred to it (possibly none) and a depth value.
 */
public class URLToDownload implements java.io.Serializable
{
    private final URL url;
    private final URL referer;
    private final int depth;

    /**
     * Queue entry without a referer.
     *
     * @param url the URL to download
     * @param depth the depth value to record
     */
    public URLToDownload(URL url, int depth)
    {
        this.url = url;
        this.referer = null;
        this.depth = depth;
    }

    /**
     * Queue entry with a referer.
     *
     * @param url the URL to download
     * @param referer the referring URL, or null if there is none
     * @param depth the depth value to record
     */
    public URLToDownload(URL url, URL referer, int depth)
    {
        this.url = url;
        this.referer = referer;
        this.depth = depth;
    }

    /** @return the URL to download */
    public URL getURL()
    {
        return this.url;
    }

    /** @return the referring URL, or null if none was given */
    public URL getReferer()
    {
        return this.referer;
    }

    /** @return the recorded depth value */
    public int getDepth()
    {
        return this.depth;
    }

    /** @return "&lt;url&gt;, referer &lt;referer&gt;, depth &lt;depth&gt;" */
    public String toString()
    {
        StringBuffer text = new StringBuffer();
        text.append(url);
        text.append(", referer ").append(referer);
        text.append(", depth ").append(depth);
        return text.toString();
    }
}
|
|
@ -0,0 +1,138 @@
|
|||
/**
|
||||
* LechLogger.java: A Graphical Logger
|
||||
* The original weblech had a simple text interface and used an apache.org library for logging.
|
||||
* When I wanted to make this thing into a graphical app, I realized the text logging
|
||||
* was going to have to go. It assumes you only want to log to one location (not a problem for
|
||||
* this application).
|
||||
*/
|
||||
package weblech.ui;
|
||||
|
||||
import javax.swing.JTextArea;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Static logger that appends messages to a single Swing text area.
 * Until a text area is set, messages are silently discarded. Each level
 * (error, warn, info, debug) can be toggled independently; all start
 * enabled.
 */
public class LechLogger {

    /**
     * The actual text area that will perform all output.
     */
    private static JTextArea _loggerWindow;
    /**
     * These are flags for enabling different types of logging mechanisms.
     */
    private static boolean error_enable, warn_enable, info_enable, debug_enable;

    /**
     * Only need one initialization since this will be shared between many different
     * objects.
     */
    static {
        _loggerWindow = null;
        error_enable = true;
        warn_enable = true;
        info_enable = true;
        debug_enable = true;
    }

    /** Everybody wants to log, but you only need one logger! */
    public LechLogger() {
    }

    /**
     * Sets the textual component to perform the logging.
     * Synchronized on the same lock as log() so that logging threads see
     * the new component.
     *
     * @param textarea the component to append to, or null to discard messages
     */
    public static synchronized void setTextArea ( JTextArea textarea ) {
        _loggerWindow = textarea;
    }

    /**
     * A private method for actually writing the messages.
     * It is synchronized because the weblech spider is multi
     * threaded. The text area itself is only touched on the Swing event
     * dispatch thread: when called from any other thread the append is
     * queued with invokeLater, so it may show up slightly later.
     */
    private static synchronized void log ( final String msg ) {
        final JTextArea window = _loggerWindow;
        if ( window == null ) {
            //System.out.println ( msg );
            return;
        }
        Runnable appendLine = new Runnable() {
            public void run() {
                window.setEditable ( true );
                window.append ( msg );
                window.append ( "\n" );
                window.setEditable ( false );
            }
        };
        if ( javax.swing.SwingUtilities.isEventDispatchThread() ) {
            appendLine.run();
        }
        else {
            javax.swing.SwingUtilities.invokeLater ( appendLine );
        }
    }

    /**
     * Text to log for an exception: its message, or its toString() when it
     * has no message, or a placeholder when no exception was supplied.
     */
    private static String describe ( Exception exception ) {
        if ( exception == null ) {
            return "(no exception supplied)";
        }
        String detail = exception.getMessage();
        return detail != null ? detail : exception.toString();
    }

    /**
     * Toggle error logging.
     */
    public static void setErrorLogging() {
        error_enable = !error_enable;
    }

    /**
     * Log an error message.
     */
    public static void error ( String msg ) {
        if ( !error_enable ) return;
        log ( "*error>" + msg );
    }

    /**
     * Log an error message and an exception.
     *
     * @param msg the message text
     * @param exception the related exception; null is tolerated
     */
    public static void error ( String msg, Exception exception ) {
        if ( !error_enable ) return;
        log ( "*error>" + msg + "\n" + describe ( exception ) );
    }

    /**
     * Toggle informational messages.
     */
    public static void setInformationalLogging() {
        info_enable = !info_enable;
    }

    /**
     * Log an informational message.
     */
    public static void info ( String msg ) {
        if ( !info_enable ) return;
        log ( "^info>" + msg );
    }

    /**
     * Toggle warning messages.
     */
    public static void setWarningLogging() {
        warn_enable = !warn_enable;
    }

    /**
     * Log a warning message.
     */
    public static void warn ( String msg ) {
        if ( !warn_enable ) return;
        log ( "-warn>" + msg );
    }

    /**
     * Log a warning message with an exception.
     *
     * @param msg the message text
     * @param exception the related exception; null is tolerated
     */
    public static void warn ( String msg, IOException exception ) {
        if ( !warn_enable ) return;
        log ( "-warn>" + msg + "\n" + describe ( exception ) );
    }

    /**
     * Toggle debug messages to be printed.
     */
    public static void setDebugLogging() {
        debug_enable = !debug_enable;
    }

    /**
     * Log a debugging statement to the logging text area.
     */
    public static void debug ( String msg ) {
        if ( !debug_enable ) return;
        log ( "@debug>" + msg );
    }
}
|
|
@ -0,0 +1,251 @@
|
|||
/**
|
||||
* SpiderConfigPanel.java: A graphical panel for configuring a SpiderConfig object.
|
||||
* This panel provides a way to change the more practical options of the WebLech Spider.
|
||||
* It supports saving and opening of SpiderConfigurations from a file. It does not use
|
||||
* any of the "interesting" or "boring" url features, or the email link save file.
|
||||
*/
|
||||
package weblech.ui;
|
||||
|
||||
import weblech.spider.Spider;
|
||||
import weblech.spider.SpiderConfig;
|
||||
|
||||
import javax.swing.JPanel;
|
||||
import javax.swing.JTextField;
|
||||
import javax.swing.JLabel;
|
||||
import javax.swing.JButton;
|
||||
import javax.swing.JComboBox;
|
||||
import javax.swing.JFileChooser;
|
||||
|
||||
import java.awt.GridLayout;
|
||||
import java.awt.FlowLayout;
|
||||
|
||||
import java.awt.event.ActionListener;
|
||||
import java.awt.event.ActionEvent;
|
||||
|
||||
import java.util.Properties;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
|
||||
public class SpiderConfigPanel extends JPanel implements ActionListener {
|
||||
|
||||
/**
|
||||
* A list of all of the spiders that the GUI will have downloading. It is assumed
|
||||
* that the user knows how much bandwidth you have and really wants to try and get
|
||||
* several different sites at the same time.
|
||||
*/
|
||||
private ArrayList spiders;
|
||||
/**
|
||||
* Various text fields for the configuration options.
|
||||
*/
|
||||
private JTextField sitenamefield, dirfield, usernamefield, passwordfield, agentfield, depthfield, matchfield;
|
||||
/**
|
||||
* A selection box for the number of threads a new Spider should use, I am limiting
|
||||
* the number of threads to 4 for simplicity.
|
||||
*/
|
||||
private JComboBox threadbox;
|
||||
|
||||
public SpiderConfigPanel() {
|
||||
super ( new GridLayout ( 8, 1 ) );
|
||||
spiders = new ArrayList();
|
||||
|
||||
/* Panel for the directory to save all files */
|
||||
JPanel sitepanel = new JPanel();
|
||||
((FlowLayout) sitepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel sitelabel = new JLabel ( "Output Directory:" );
|
||||
dirfield = new JTextField ( System.getProperty ( "user.home" ), 20 );
|
||||
sitepanel.add ( sitelabel );
|
||||
sitepanel.add ( dirfield );
|
||||
|
||||
/* Panel for the site to download */
|
||||
JPanel outputpanel = new JPanel();
|
||||
((FlowLayout) outputpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel dirlabel = new JLabel ( "Download Site:" );
|
||||
sitenamefield = new JTextField ( "http://www.google.com/", 20 );
|
||||
outputpanel.add ( dirlabel );
|
||||
outputpanel.add ( sitenamefield );
|
||||
|
||||
/* Panel for the HTTP username */
|
||||
JPanel usernamepanel = new JPanel();
|
||||
((FlowLayout) usernamepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel usernamelabel = new JLabel ( "Username:" );
|
||||
usernamefield = new JTextField ( "", 20 );
|
||||
usernamepanel.add ( usernamelabel );
|
||||
usernamepanel.add ( usernamefield );
|
||||
|
||||
/* Panel for the HTTP password */
|
||||
JPanel passpanel = new JPanel();
|
||||
((FlowLayout) passpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel passwdlabel = new JLabel ( "Password:" );
|
||||
passwordfield = new JTextField ( "", 20 );
|
||||
passpanel.add ( passwdlabel );
|
||||
passpanel.add ( passwordfield );
|
||||
|
||||
/* Panel for the HTTP user agent */
|
||||
JPanel agentpanel = new JPanel();
|
||||
((FlowLayout) agentpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel agentlabel = new JLabel ( "User Agent:" );
|
||||
agentfield = new JTextField ( "WebLech [Version C]", 20 );
|
||||
agentpanel.add ( agentlabel );
|
||||
agentpanel.add ( agentfield );
|
||||
|
||||
/* Panel for a simple string match downloading limiter (no match, no download) */
|
||||
JPanel matchpanel = new JPanel();
|
||||
((FlowLayout) matchpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel matchlabel = new JLabel ( "Match String:" );
|
||||
matchfield = new JTextField ( "", 20 );
|
||||
matchpanel.add ( matchlabel );
|
||||
matchpanel.add ( matchfield );
|
||||
|
||||
/* Provides a panel for placing both the maximum depth and threads for this spider */
|
||||
JPanel detailpanel = new JPanel ( new GridLayout ( 1, 2 ) );
|
||||
|
||||
JPanel depthpanel = new JPanel();
|
||||
((FlowLayout) depthpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||
JLabel depthlabel = new JLabel ( "Max Depth:" );
|
||||
depthfield = new JTextField ( Integer.toString ( 0 ), 5 );
|
||||
depthpanel.add ( depthlabel );
|
||||
depthpanel.add ( depthfield );
|
||||
|
||||
JPanel threadpanel = new JPanel();
|
||||
JLabel threadlabel = new JLabel ( "Spider Threads:" );
|
||||
Integer[] threaditems = { new Integer ( "1" ), new Integer ( "2" ), new Integer ( "3" ), new Integer ( "4" ) };
|
||||
threadbox = new JComboBox ( threaditems );
|
||||
threadpanel.add ( threadlabel );
|
||||
threadpanel.add ( threadbox );
|
||||
|
||||
detailpanel.add ( depthpanel );
|
||||
detailpanel.add ( threadpanel );
|
||||
|
||||
/* Panel of buttons for various operations */
|
||||
JPanel buttonpanel = new JPanel();
|
||||
JButton save = new JButton ( "Save" );
|
||||
JButton spiderbutton = new JButton ( "Spider It" );
|
||||
JButton open = new JButton ( "Open" );
|
||||
JButton qbutton = new JButton ( "Quit" );
|
||||
buttonpanel.add ( save );
|
||||
buttonpanel.add ( spiderbutton );
|
||||
buttonpanel.add ( open );
|
||||
buttonpanel.add ( qbutton );
|
||||
|
||||
add ( sitepanel );
|
||||
add ( outputpanel );
|
||||
add ( usernamepanel );
|
||||
add ( passpanel );
|
||||
add ( agentpanel );
|
||||
add ( matchpanel );
|
||||
add ( detailpanel );
|
||||
add ( buttonpanel );
|
||||
|
||||
/* Configure the button actions */
|
||||
save.setActionCommand ( "save" );
|
||||
open.setActionCommand ( "open" );
|
||||
spiderbutton.setActionCommand ( "spider" );
|
||||
qbutton.setActionCommand ( "quit" );
|
||||
save.addActionListener ( this );
|
||||
open.addActionListener ( this );
|
||||
spiderbutton.addActionListener ( this );
|
||||
qbutton.addActionListener ( this );
|
||||
LechLogger.debug ( "Actions" );
|
||||
}
|
||||
|
||||
/**
|
||||
* This method will create a Properties object good for instantiating a new SpiderConfig
|
||||
* Object.
|
||||
*/
|
||||
private Properties createProperties() {
|
||||
Properties p = new Properties();
|
||||
p.setProperty ( "saveRootDirectory", dirfield.getText() );
|
||||
p.setProperty ( "startLocation", sitenamefield.getText() );
|
||||
p.setProperty ( "basicAuthUser", usernamefield.getText() );
|
||||
p.setProperty ( "basicAuthPassword", passwordfield.getText() );
|
||||
p.setProperty ( "urlMatch", matchfield.getText() );
|
||||
p.setProperty ( "spiderThreads", ((Integer) threadbox.getSelectedItem()).toString() );
|
||||
p.setProperty ( "maxDepth", depthfield.getText() );
|
||||
p.setProperty ( "userAgent", agentfield.getText() );
|
||||
p.setProperty ( "interestingURLs", "" );
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method will extract all of the values from a SpiderConfig object that the GUI uses
|
||||
* and updates the panel to show the values in the object.
|
||||
*/
|
||||
private void setSpiderConfig ( SpiderConfig sc ) {
|
||||
dirfield.setText ( sc.getSaveRootDirectory().toString() );
|
||||
sitenamefield.setText ( sc.getStartLocation().toString() );
|
||||
usernamefield.setText ( sc.getBasicAuthUser() );
|
||||
passwordfield.setText ( sc.getBasicAuthPassword() );
|
||||
matchfield.setText ( sc.getURLMatch() );
|
||||
int t = sc.getSpiderThreads();
|
||||
if ( t < 1 || t > 4 ) {
|
||||
t = 1;
|
||||
sc.setSpiderThreads ( t );
|
||||
}
|
||||
threadbox.setSelectedIndex ( t - 1 );
|
||||
depthfield.setText ( Integer.toString ( sc.getMaxDepth() ) );
|
||||
agentfield.setText ( sc.getUserAgent() );
|
||||
}
|
||||
|
||||
/**
|
||||
* This method will coordinate all of the actions for the various buttons used.
|
||||
*/
|
||||
public void actionPerformed ( ActionEvent event ) {
|
||||
String cmd = event.getActionCommand();
|
||||
/* Download a new site */
|
||||
if ( cmd.equals ( "spider" ) ) {
|
||||
SpiderConfig c = new SpiderConfig ( createProperties() );
|
||||
Spider spider = new Spider ( c );
|
||||
/* But only if we are not already downloading the site */
|
||||
if ( spiders.contains ( spider ) ) {
|
||||
LechLogger.warn ( "Already have an instance of a Spider at " + c.getStartLocation() );
|
||||
return;
|
||||
}
|
||||
spiders.add ( spider );
|
||||
spider.start();
|
||||
}
|
||||
/* Save the current configuration to a file */
|
||||
else if ( cmd.equals ( "save" ) ) {
|
||||
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
|
||||
int r = f.showSaveDialog ( this );
|
||||
if ( r != JFileChooser.APPROVE_OPTION ) return;
|
||||
File outfile = f.getSelectedFile();
|
||||
try {
|
||||
ObjectOutputStream os = new ObjectOutputStream ( new FileOutputStream ( outfile ) );
|
||||
os.writeObject ( new SpiderConfig ( createProperties() ) );
|
||||
os.close();
|
||||
}
|
||||
catch ( Exception exception ) {
|
||||
LechLogger.error ( exception.toString() );
|
||||
}
|
||||
}
|
||||
/* Open a saved configuration from a file */
|
||||
else if ( cmd.equals ( "open" ) ) {
|
||||
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
|
||||
int r = f.showOpenDialog ( this );
|
||||
if ( r != JFileChooser.APPROVE_OPTION ) return;
|
||||
File infile = f.getSelectedFile();
|
||||
if ( !infile.canRead() ) {
|
||||
LechLogger.error ( "file " + f.toString() + " is not readable" );
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ObjectInputStream os = new ObjectInputStream ( new FileInputStream ( infile ) );
|
||||
SpiderConfig sc = (SpiderConfig) os.readObject();
|
||||
os.close();
|
||||
setSpiderConfig ( sc );
|
||||
}
|
||||
catch ( Exception exception ) {
|
||||
LechLogger.error ( exception.toString() );
|
||||
}
|
||||
}
|
||||
/* Just quit */
|
||||
else if ( cmd.equals ( "quit" ) ) {
|
||||
System.exit ( 0 );
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
/**
|
||||
* Troll.java: A user interface to the weblech spider download utility.
|
||||
*/
|
||||
package weblech.ui;
|
||||
|
||||
/* I like to explicitly import all of my packages to remind me to KISS */
|
||||
import javax.swing.JFrame;
|
||||
import javax.swing.JPanel;
|
||||
import javax.swing.JTabbedPane;
|
||||
import javax.swing.JTextArea;
|
||||
import javax.swing.JScrollPane;
|
||||
import javax.swing.JMenuBar;
|
||||
import javax.swing.JMenu;
|
||||
import javax.swing.JMenuItem;
|
||||
import javax.swing.JCheckBoxMenuItem;
|
||||
|
||||
import java.awt.Dimension;
|
||||
|
||||
import java.awt.event.ActionListener;
|
||||
import java.awt.event.ActionEvent;
|
||||
|
||||
public class Troll extends JFrame implements ActionListener {
|
||||
|
||||
/**
|
||||
* This SpiderConfigPanel is a custom panel that provides many of the more
|
||||
* practical features of the weblech spider. It also controls the spiders
|
||||
* created by the user.
|
||||
*/
|
||||
private SpiderConfigPanel configpanel;
|
||||
/**
|
||||
* This is the area that all of the logging facilities will use. This makes debugging
|
||||
* in a system like Max OS X much simpler (:^)
|
||||
*/
|
||||
private static JTextArea logarea;
|
||||
|
||||
/* This just initializes the logging text box and readies it for recording events before
|
||||
* the rest of the object is even loaded.
|
||||
*/
|
||||
static {
|
||||
logarea = new JTextArea();
|
||||
LechLogger.setTextArea ( logarea );
|
||||
LechLogger.setDebugLogging();
|
||||
}
|
||||
|
||||
Troll() {
|
||||
super ( "Troll" );
|
||||
Dimension initialsize = new Dimension ( 400, 375 );
|
||||
setSize ( initialsize);
|
||||
|
||||
/* Create a menubar for controlling which aspects of the log you wish to see */
|
||||
JMenuBar menubar = new JMenuBar();
|
||||
JMenu logmenu = new JMenu ( "Log Options" );
|
||||
JCheckBoxMenuItem showdebug = new JCheckBoxMenuItem ( "Show Debug Messages", false );
|
||||
showdebug.setActionCommand ( "debug" );
|
||||
showdebug.addActionListener ( this );
|
||||
JCheckBoxMenuItem showinfo = new JCheckBoxMenuItem ( "Show Informational Messages", true );
|
||||
showinfo.setActionCommand ( "info" );
|
||||
showinfo.addActionListener ( this );
|
||||
JCheckBoxMenuItem showwarn = new JCheckBoxMenuItem ( "Show Warnings", true );
|
||||
showwarn.setActionCommand ( "warn" );
|
||||
showwarn.addActionListener ( this );
|
||||
JCheckBoxMenuItem showerror = new JCheckBoxMenuItem ( "Show Errors", true );
|
||||
showerror.setActionCommand ( "error" );
|
||||
showerror.addActionListener ( this );
|
||||
logmenu.add ( showdebug );
|
||||
logmenu.add ( showinfo );
|
||||
logmenu.add ( showwarn );
|
||||
logmenu.add ( showerror );
|
||||
menubar.add ( logmenu );
|
||||
|
||||
/* A simple tab interface between configuration and error checking */
|
||||
configpanel = new SpiderConfigPanel();
|
||||
JPanel logpanel = new JPanel();
|
||||
logpanel.add ( logarea );
|
||||
JScrollPane logscroller = new JScrollPane ( logpanel );
|
||||
JTabbedPane tabs = new JTabbedPane();
|
||||
tabs.addTab ( "Spider", configpanel );
|
||||
tabs.addTab ( "Log", logscroller );
|
||||
|
||||
/* Configure the JFrame to a usable state */
|
||||
setJMenuBar ( menubar );
|
||||
getContentPane().add ( tabs );
|
||||
setLocationRelativeTo ( null );
|
||||
setVisible ( true );
|
||||
setDefaultCloseOperation ( JFrame.EXIT_ON_CLOSE );
|
||||
}
|
||||
|
||||
/**
|
||||
* This method basically toggles all of the logging options.
|
||||
*/
|
||||
public void actionPerformed ( ActionEvent event ) {
|
||||
String cmd = event.getActionCommand();
|
||||
if ( cmd.equals ( "debug" ) ) {
|
||||
LechLogger.setDebugLogging();
|
||||
}
|
||||
else if ( cmd.equals ( "info" ) ) {
|
||||
LechLogger.setInformationalLogging();
|
||||
}
|
||||
else if ( cmd.equals ( "warn" ) ) {
|
||||
LechLogger.setDebugLogging();
|
||||
}
|
||||
else if ( cmd.equals ( "error" ) ) {
|
||||
LechLogger.setErrorLogging();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new troll and go
|
||||
*/
|
||||
public static void main ( String[] args ) {
|
||||
Troll t = new Troll();
|
||||
}
|
||||
}
|
Reference in New Issue