Initial revision

master
mercury 2004-03-07 20:51:05 +00:00
commit 80c48012cd
14 changed files with 2167 additions and 0 deletions

26
LICENSE.txt Executable file
View File

@ -0,0 +1,26 @@
WebLech license information.
============================
This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
Copyright (c) 2001 Brian Pitcher
Copyright (c) 2004 Andrew Coleman
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

31
build.xml Executable file
View File

@ -0,0 +1,31 @@
<!-- Ant build for WebLech: compiles the Java sources and packages them into a date-stamped jar. -->
<!-- The default target is "dist". -->
<project name="WebLech" default="dist" basedir=".">
<description>
WebLech - a tool for downloading the web
</description>
<!-- Locations, all resolved relative to basedir: source tree, compiled classes, packaged jars. -->
<property name="src" location="weblech" />
<property name="build" location="build" />
<property name="dist" location="jars" />
<!-- init: sets the timestamp properties (DSTAMP is used in the jar name below) and creates the class output directory. -->
<target name="init">
<tstamp />
<mkdir dir="${build}" />
</target>
<!-- compile: compiles everything under ${src} into ${build}. -->
<target name="compile" depends="init">
<javac srcdir="${src}" destdir="${build}" />
</target>
<!-- dist: writes a manifest naming weblech.ui.Troll as the entry point, then jars up ${build}. -->
<!-- Note that the generated MANIFEST.MF is left in ${dist} next to the jar. -->
<target name="dist" depends="compile">
<mkdir dir="${dist}" />
<manifest file="${dist}/MANIFEST.MF">
<attribute name="Main-Class" value="weblech.ui.Troll" />
</manifest>
<jar manifest="${dist}/MANIFEST.MF" jarfile="${dist}/WebLech-${DSTAMP}.jar" basedir="${build}" />
</target>
<!-- clean: removes both generated directories (compiled classes and jars). -->
<target name="clean">
<delete dir="${build}" />
<delete dir="${dist}" />
</target>
</project>

18
weblech/spider/Constants.java Executable file
View File

@ -0,0 +1,18 @@
/*
* Created by IntelliJ IDEA.
* User: Michael Mason
* Date: Jun 5, 2002
* Time: 6:43:04 PM
* To change template for new interface use
* Code Style | Class Templates options (Tools | IDE Options).
*/
package weblech.spider;
/**
 * Timing constants shared by the spider classes. Classes pick these up by
 * implementing this interface.
 */
public interface Constants
{
    /**
     * How often to check the queue status. The spider passes this value to
     * Thread.sleep(), so it is in milliseconds.
     */
    public static final int QUEUE_CHECK_INTERVAL = 500;

    /**
     * How long to pause for threads to finish before exiting.
     * NOTE(review): presumably milliseconds like QUEUE_CHECK_INTERVAL; no user
     * of this constant is visible here, so confirm against the caller.
     */
    public static final int SPIDER_STOP_PAUSE = 500;
}

143
weblech/spider/DownloadQueue.java Executable file
View File

@ -0,0 +1,143 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/DownloadQueue.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
package weblech.spider;
import java.util.*;
import java.net.URL;
import java.io.Serializable;
/**
 * Queue of URLs waiting to be downloaded, split into three priority buckets
 * ("interesting", "average" and "boring") as classified by the SpiderConfig.
 * A URL that is already waiting in the queue is not queued a second time.
 *
 * This class does no locking of its own.
 */
public class DownloadQueue implements Serializable
{
    /** Supplies the interesting/boring classification and the search order. */
    private SpiderConfig config;
    /** Highest priority bucket: handed out before anything else. */
    private List interestingURLsToDownload;
    /** Middle bucket: URLs that are neither interesting nor boring. */
    private List averageURLsToDownload;
    /** Lowest priority bucket: only handed out when the other two are empty. */
    private List boringURLsToDownload;
    /** The java.net.URL of every entry currently waiting in any bucket. */
    private Set urlsInQueue;

    /**
     * Creates an empty queue.
     *
     * @param config consulted on every queueURL() call
     */
    public DownloadQueue(SpiderConfig config)
    {
        this.config = config;
        urlsInQueue = new HashSet();
        boringURLsToDownload = new ArrayList();
        averageURLsToDownload = new ArrayList();
        interestingURLsToDownload = new ArrayList();
    }

    /**
     * Adds a URL to the bucket matching its classification, unless the same
     * URL is already waiting. In depth-first mode the entry goes to the front
     * of its bucket, otherwise to the back.
     *
     * @param url the download request to queue
     */
    public void queueURL(URLToDownload url)
    {
        URL target = url.getURL();
        if(urlsInQueue.contains(target))
        {
            return;
        }

        // "Interesting" is checked first, so it wins if a URL matches both lists.
        List bucket;
        if(config.isInteresting(target))
        {
            bucket = interestingURLsToDownload;
        }
        else if(config.isBoring(target))
        {
            bucket = boringURLsToDownload;
        }
        else
        {
            bucket = averageURLsToDownload;
        }

        if(config.isDepthFirstSearch())
        {
            bucket.add(0, url);
        }
        else
        {
            bucket.add(url);
        }
        urlsInQueue.add(target);
    }

    /**
     * Queues every element of the collection, in iteration order.
     *
     * @param urls collection of URLToDownload objects
     */
    public void queueURLs(Collection urls)
    {
        Iterator it = urls.iterator();
        while(it.hasNext())
        {
            queueURL((URLToDownload) it.next());
        }
    }

    /**
     * Removes and returns the next URL to download: interesting URLs first,
     * then average ones, then boring ones.
     *
     * @return the next entry, or null if the queue is empty
     */
    public URLToDownload getNextInQueue()
    {
        if(!interestingURLsToDownload.isEmpty())
        {
            return returnURLFrom(interestingURLsToDownload);
        }
        if(!averageURLsToDownload.isEmpty())
        {
            return returnURLFrom(averageURLsToDownload);
        }
        if(!boringURLsToDownload.isEmpty())
        {
            return returnURLFrom(boringURLsToDownload);
        }
        return null;
    }

    /**
     * Takes the head of the given (non-empty) bucket and forgets its URL, so
     * that the same URL may be queued again later.
     */
    private URLToDownload returnURLFrom(List urlList)
    {
        URLToDownload head = (URLToDownload) urlList.remove(0);
        urlsInQueue.remove(head.getURL());
        return head;
    }

    /**
     * @return the total number of entries waiting across all three buckets
     */
    public int size()
    {
        int total = interestingURLsToDownload.size();
        total += averageURLsToDownload.size();
        total += boringURLsToDownload.size();
        return total;
    }

    /**
     * @return a short description of the form "N URLs"
     */
    public String toString()
    {
        return size() + " URLs";
    }
} // End class DownloadQueue

View File

@ -0,0 +1,50 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import java.net.Authenticator;
import java.net.PasswordAuthentication;
import weblech.ui.LechLogger;
/**
 * Authenticator that answers every authentication request with the same
 * fixed user name and password, whatever is asking for them.
 */
public class DumbAuthenticator extends Authenticator
{
    private final String user;
    private final String password;

    /**
     * @param user user name returned for every request
     * @param password password returned for every request; it is masked in the debug log
     */
    public DumbAuthenticator(String user, String password)
    {
        this.user = user;
        this.password = password;
        LechLogger.debug("DumbAuthenticator(" + user + ", ***)");
    }

    /**
     * Returns the fixed credentials. A fresh character array is built from the
     * password on every call.
     */
    public PasswordAuthentication getPasswordAuthentication()
    {
        LechLogger.debug("getPasswordAuthentication()");
        char[] secret = password.toCharArray();
        return new PasswordAuthentication(user, secret);
    }
}

188
weblech/spider/HTMLParser.java Executable file
View File

@ -0,0 +1,188 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import java.util.List;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.FileWriter;
import java.io.PrintWriter;
import weblech.ui.LechLogger;
public class HTMLParser
{
private SpiderConfig config;
public HTMLParser(SpiderConfig config)
{
this.config = config;
}
public List parseLinksInDocument(URL sourceURL, String textContent)
{
return parseAsHTML(sourceURL, textContent);
}
private List parseAsHTML(URL sourceURL, String textContent)
{
LechLogger.debug("parseAsHTML()");
ArrayList newURLs = new ArrayList();
HashSet newURLSet = new HashSet();
/* note from coleman:
* I had to add a few tags into this, namely the link and embeds. weblech should download flash
* movies, mpegs, avis, and anything else that it finds on the page. even stylesheets :)
*/
extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("link", "href", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("embed", "src", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("LINK", "HREF", sourceURL, newURLs, newURLSet, textContent);
extractAttributesFromTags("EMBED", "SRC", sourceURL, newURLs, newURLSet, textContent);
if(newURLs.size() == 0)
{
LechLogger.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);
}
LechLogger.debug("Returning " + newURLs.size() + " urls extracted from page");
return newURLs;
}
private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)
{
LechLogger.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");
int startPos = 0;
String startTag = "<" + tag + " ";
String attrStr = attr + "=\"";
while(true)
{
int tagPos = input.indexOf(startTag, startPos);
if(tagPos < 0)
{
return;
}
int attrPos = input.indexOf(attrStr, tagPos + 1);
if(attrPos < 0)
{
startPos = tagPos + 1;
continue;
}
int nextClosePos = input.indexOf(">", tagPos + 1);
if(attrPos < nextClosePos)
{
// Ooh, found one
int closeQuotePos = input.indexOf("\"", attrPos + attrStr.length() + 1);
if(closeQuotePos > 0)
{
String urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos);
if(urlStr.indexOf('#') != -1)
{
urlStr = urlStr.substring(0, urlStr.indexOf('#'));
}
//LechLogger.debug("Found possible URL string: " + URL);
if(isMailTo(urlStr))
{
logMailURL(urlStr);
}
else
{
try
{
URL u = new URL(sourceURL, urlStr);
if(newURLSet.contains(u))
{
//LechLogger.debug("Already found URL on page: " + u);
}
else
{
newURLs.add(u);
newURLSet.add(u);
//LechLogger.debug("Found new URL on page: " + u);
}
}
catch(MalformedURLException murle)
{
}
}
}
startPos = tagPos + 1;
continue;
}
else
{
startPos = tagPos + 1;
continue;
}
}
}
private void logMailURL(String url)
{
LechLogger.debug("logMailURL()");
try
{
FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true);
PrintWriter pW = new PrintWriter(appendedFile);
pW.println(url);
pW.flush();
pW.close();
}
catch(IOException ioe)
{
LechLogger.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
}
}
/**
* Check if a particular URL looks like it's a mailto: style link.
*/
private boolean isMailTo(String url)
{
if(url == null)
{
return false;
}
url = url.toUpperCase();
return (url.indexOf("MAILTO:") != -1);
}
}

333
weblech/spider/Spider.java Executable file
View File

@ -0,0 +1,333 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import weblech.ui.LechLogger;
import java.util.*;
import java.io.*;
import java.net.URL;
/**
 * Multi-threaded web spider. start() launches the configured number of
 * worker threads, each of which repeatedly takes a URL from the shared
 * download queue, downloads it, and queues the links found in it.
 *
 * Locking rules: the queue, downloadsInProgress and checkpoint writing are
 * guarded by the queue's monitor; each URL set is guarded by its own monitor.
 * When two of these are held at once, the queue monitor is always taken
 * first. The running counter has its own private lock.
 */
public class Spider implements Runnable, Constants
{
    /** Config for the spider */
    private SpiderConfig config;
    /**
     * Download queue.
     * Thread safety: To access the queue, first synchronize on it.
     */
    private DownloadQueue queue;
    /**
     * Set of URLs downloaded or scheduled, so we don't download a
     * URL more than once.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloadedOrScheduled;
    /**
     * Set of URLs currently being downloaded by Spider threads.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloading;
    /**
     * Number of downloads currently taking place.
     * Thread safety: To read or modify this value, first synchronize on
     * the download queue.
     */
    private int downloadsInProgress;
    /** Whether the spider should quit. Volatile: set by stop(), polled by the workers. */
    private volatile boolean quit;
    /** Count of running Spider threads. Guarded by runningLock. */
    private int running;
    /** Guards the running counter. */
    private final Object runningLock = new Object();
    /** Time we last checkpointed. Written while holding the queue monitor. */
    private volatile long lastCheckpoint;

    /**
     * Creates a spider whose queue initially holds only the configured start
     * location. The start location is also recorded as scheduled, so a link
     * back to it does not trigger a second download.
     *
     * @param config spider configuration
     */
    public Spider(SpiderConfig config)
    {
        this.config = config;
        urlsDownloadedOrScheduled = new HashSet();
        urlsDownloading = new HashSet();
        downloadsInProgress = 0;
        lastCheckpoint = 0;
        queue = new DownloadQueue(config);
        URL startLocation = config.getStartLocation();
        queue.queueURL(new URLToDownload(startLocation, 0));
        urlsDownloadedOrScheduled.add(startLocation);
    }

    /**
     * Starts the configured number of worker threads, all running this object.
     */
    public void start()
    {
        quit = false;
        synchronized(runningLock)
        {
            running = 0;
        }
        int threadCount = config.getSpiderThreads();
        for(int i = 0; i < threadCount; i++)
        {
            LechLogger.info("Starting Spider thread");
            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
            // Count the thread before it starts, so a worker that finishes
            // immediately cannot decrement the counter first.
            synchronized(runningLock)
            {
                running++;
            }
            t.start();
        }
    }

    /**
     * Asks the worker threads to stop. Each worker notices the request before
     * it takes its next URL; downloads already under way are completed.
     */
    public void stop()
    {
        quit = true;
    }

    /**
     * @return true while at least one worker thread has not yet finished.
     *         (This used to return true only when NO thread was running.)
     */
    public boolean isRunning()
    {
        synchronized(runningLock)
        {
            return running > 0;
        }
    }

    /**
     * Writes a checkpoint if checkpointing is enabled (interval != 0) and more
     * than the configured interval has passed since the last one. The elapsed
     * time is checked again under the queue monitor so that only one of
     * several threads arriving together writes the file.
     */
    private void checkpointIfNeeded()
    {
        long interval = config.getCheckpointInterval();
        if(interval == 0)
        {
            return;
        }
        if(System.currentTimeMillis() - lastCheckpoint > interval)
        {
            synchronized(queue)
            {
                if(System.currentTimeMillis() - lastCheckpoint > interval)
                {
                    writeCheckpoint();
                    lastCheckpoint = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Serializes the queue and the set of in-flight URLs to the file
     * "spider.checkpoint", replacing any previous checkpoint. The caller must
     * hold the queue monitor. I/O failures are logged as warnings.
     */
    private void writeCheckpoint()
    {
        LechLogger.debug("writeCheckpoint()");
        FileOutputStream fos = null;
        try
        {
            fos = new FileOutputStream("spider.checkpoint", false);
            ObjectOutputStream oos = new ObjectOutputStream(fos);
            oos.writeObject(queue);
            // Keep workers from modifying the set while it is being written.
            synchronized(urlsDownloading)
            {
                oos.writeObject(urlsDownloading);
            }
            // Closed here rather than in finally so that a failed flush is reported.
            oos.close();
        }
        catch(IOException ioe)
        {
            LechLogger.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
        }
        finally
        {
            if(fos != null)
            {
                try
                {
                    // No-op after a successful oos.close(); releases the file otherwise.
                    fos.close();
                }
                catch(IOException ignored)
                {
                    // Nothing useful left to do; any write failure was reported above.
                }
            }
        }
    }

    /**
     * Restores the queue from the file "spider.checkpoint". URLs that were
     * being downloaded when the checkpoint was written are put back on the
     * queue. If anything goes wrong the error is logged and the current state
     * is left untouched.
     *
     * Call this before start(): the queue object doubles as the lock used by
     * the worker threads and must not be replaced while they are running.
     * NOTE(review): this uses Java native deserialization, so the checkpoint
     * file must come from a trusted source.
     */
    public void readCheckpoint()
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream("spider.checkpoint");
            ObjectInputStream ois = new ObjectInputStream(fis);
            DownloadQueue restoredQueue = (DownloadQueue) ois.readObject();
            Set interruptedDownloads = (Set) ois.readObject();
            restoredQueue.queueURLs(interruptedDownloads);
            interruptedDownloads.clear();
            // Only install the restored state once all of it has been read.
            queue = restoredQueue;
            urlsDownloading = interruptedDownloads;
        }
        catch(Exception e)
        {
            LechLogger.error("Caught exception reading checkpoint: " + e.getMessage(), e);
        }
        finally
        {
            if(fis != null)
            {
                try
                {
                    fis.close();
                }
                catch(IOException ignored)
                {
                    // Read-only stream; a failed close loses nothing.
                }
            }
        }
    }

    /**
     * Worker loop. Runs until stop() is called, the thread is interrupted, or
     * the queue is empty and no other worker has a download in progress
     * (a download in progress may still add URLs to the queue).
     */
    public void run()
    {
        try
        {
            HTMLParser htmlParser = new HTMLParser(config);
            URLGetter urlGetter = new URLGetter(config);
            while(!quit)
            {
                URLToDownload nextURL;
                boolean downloadsPending;
                // Take the URL and register it as in progress in one atomic
                // step, so that two workers cannot race for the last entry and
                // a checkpoint never sees the URL in neither collection.
                synchronized(queue)
                {
                    nextURL = queue.getNextInQueue();
                    if(nextURL != null)
                    {
                        downloadsInProgress++;
                        synchronized(urlsDownloading)
                        {
                            urlsDownloading.add(nextURL);
                        }
                    }
                    downloadsPending = downloadsInProgress > 0;
                }

                if(nextURL == null)
                {
                    if(!downloadsPending)
                    {
                        break;
                    }
                    checkpointIfNeeded();
                    // Wait for a download to finish before seeing if this thread should stop
                    try
                    {
                        Thread.sleep(QUEUE_CHECK_INTERVAL);
                    }
                    catch(InterruptedException ie)
                    {
                        // Treat interruption as a request to stop this worker.
                        Thread.currentThread().interrupt();
                        break;
                    }
                    // Have another go at the loop
                    continue;
                }

                checkpointIfNeeded();
                processURL(nextURL, urlGetter, htmlParser);
            }
        }
        finally
        {
            LechLogger.info("Spider thread stopping [" + config.getStartLocation() + "]" );
            synchronized(runningLock)
            {
                running--;
            }
        }
    }

    /**
     * Downloads one URL and queues the acceptable links found in it. The URL
     * stays registered as "downloading" until its links have been queued, and
     * the bookkeeping is undone even if the download throws.
     */
    private void processURL(URLToDownload nextURL, URLGetter urlGetter, HTMLParser htmlParser)
    {
        ArrayList u2dsToQueue = new ArrayList();
        try
        {
            int newDepth = nextURL.getDepth() + 1;
            int maxDepth = config.getMaxDepth();
            // A maximum depth of 0 means "no limit".
            boolean depthAllowed = (maxDepth == 0 || newDepth <= maxDepth);

            List newURLs = filterURLs(downloadURL(nextURL, urlGetter, htmlParser));
            for(Iterator i = newURLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                // Download if not yet downloaded, and the new depth does not exceed the maximum
                synchronized(urlsDownloadedOrScheduled)
                {
                    if(depthAllowed && !urlsDownloadedOrScheduled.contains(u))
                    {
                        u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
                        urlsDownloadedOrScheduled.add(u);
                    }
                }
            }
        }
        finally
        {
            synchronized(queue)
            {
                downloadsInProgress--;
                synchronized(urlsDownloading)
                {
                    urlsDownloading.remove(nextURL);
                }
                queue.queueURLs(u2dsToQueue);
            }
        }
    }

    /**
     * Describes the queue for log messages, reading it under its monitor.
     */
    private String describeQueue()
    {
        synchronized(queue)
        {
            return queue.toString();
        }
    }

    /**
     * Get a URL, and return new URLs that are referenced from it.
     * A URL that already exists on disk is only fetched again when the
     * matching refresh option (HTML/XML or images) is switched on.
     *
     * @return A List of URL objects.
     */
    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
    {
        LechLogger.debug("downloadURL(" + url + ")");
        // Bail out early if image and already on disk
        URLObject obj = new URLObject(url.getURL(), config);
        boolean fetch = !obj.existsOnDisk()
            || (config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
            || (config.refreshImages() && obj.isImage());
        if(fetch)
        {
            LechLogger.info("Q: [" + describeQueue() + "] " + url);
            obj = urlGetter.getURL(url);
        }

        if(obj == null)
        {
            return new ArrayList();
        }
        if(!obj.existsOnDisk())
        {
            obj.writeToFile();
        }

        if(obj.isHTML() || obj.isXML())
        {
            return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
        }
        if(!obj.isImage())
        {
            LechLogger.warn("Unknown content type received: " + obj.getContentType());
            LechLogger.info("URL was " + url);
        }
        return new ArrayList();
    }

    /**
     * Keeps only the URLs that have not been downloaded or scheduled yet and
     * whose string form contains the configured urlMatch substring. If no
     * urlMatch is configured (null), every URL passes, exactly as for an
     * empty urlMatch.
     *
     * @param URLs List of URL objects
     * @return a new List of the URL objects that passed
     */
    private List filterURLs(List URLs)
    {
        String match = config.getURLMatch();
        ArrayList retVal = new ArrayList();
        synchronized(urlsDownloadedOrScheduled)
        {
            for(Iterator i = URLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                if(urlsDownloadedOrScheduled.contains(u))
                {
                    continue;
                }
                if(match == null || u.toString().indexOf(match) != -1)
                {
                    retVal.add(u);
                }
            }
        }
        return retVal;
    }

    /* Method By Coleman
     * A basic check to see if there is another spider downloading the same thing:
     * true if the given config has the same start location as this spider's config.
     */
    protected boolean compareSpiderConfig ( SpiderConfig sc ) {
        URL mine = config.getStartLocation();
        URL theirs = sc.getStartLocation();
        return mine == null ? theirs == null : mine.equals ( theirs );
    }

    /* Method By Coleman
     * A method to determine if one spider is downloading the same file as another spider.
     * Two spiders are equal when their start locations are equal; anything that is
     * not a Spider (including null) is never equal.
     */
    public boolean equals ( Object o ) {
        if ( this == o ) return true;
        if ( !( o instanceof Spider ) ) return false;
        return ((Spider) o).compareSpiderConfig ( config );
    }

    /*
     * Hash code consistent with equals(): derived from the start location only.
     */
    public int hashCode () {
        URL startLocation = config.getStartLocation();
        return startLocation == null ? 0 : startLocation.hashCode();
    }
}

464
weblech/spider/SpiderConfig.java Executable file
View File

@ -0,0 +1,464 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import weblech.ui.LechLogger;
import java.io.File;
import java.io.Serializable;
import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;
/**
 * Settings for a spider run: where to start, what to follow, where to save,
 * and how many threads to use. Can be built with defaults or from a
 * java.util.Properties object, and adjusted afterwards through the setters.
 */
public class SpiderConfig implements Serializable
{
    private File saveRootDirectory;
    private File mailtoLogFile;
    private boolean refreshHTMLs;
    private boolean refreshImages;
    private boolean refreshOthers;
    private Set htmlExtensions;
    private Set imageExtensions;
    private URL startLocation;
    private String urlMatch;
    private List interestingURLSubstrings;
    private List boringURLSubstrings;
    private boolean depthFirst;
    private int maxDepth;
    private String userAgent;
    private String basicAuthUser;
    private String basicAuthPassword;
    private int spiderThreads;
    private long checkpointInterval;

    /**
     * Create a default config. No start location or URL match is set.
     */
    public SpiderConfig()
    {
        LechLogger.debug("SpiderConfig()");
        saveRootDirectory = new File(".");
        mailtoLogFile = new File("mailto.txt");
        refreshHTMLs = true;
        refreshImages = false;
        refreshOthers = false;
        htmlExtensions = new HashSet();
        htmlExtensions.add("htm");
        htmlExtensions.add("html");
        htmlExtensions.add("shtml");
        imageExtensions = new HashSet();
        imageExtensions.add("jpg");
        imageExtensions.add("gif");
        imageExtensions.add("png");
        /* Added a few image extensions -- Coleman */
        imageExtensions.add("tiff");
        imageExtensions.add("bmp");
        urlMatch = null;
        interestingURLSubstrings = new ArrayList();
        boringURLSubstrings = new ArrayList();
        depthFirst = false;
        maxDepth = 0;
        userAgent = "WebLech Spider [Release C]";
        basicAuthUser = "";
        basicAuthPassword = "";
        spiderThreads = 1;
        checkpointInterval = 0;
    }

    /**
     * Create a config from a java.util.Properties object. Missing properties
     * fall back to defaults; unusable values are logged and replaced by the
     * fallback named in the log message.
     *
     * @param props the properties to read
     */
    public SpiderConfig(Properties props)
    {
        LechLogger.debug("SpiderConfig(props)");
        saveRootDirectory = new File(props.getProperty("saveRootDirectory", "."));
        if(!saveRootDirectory.exists())
        {
            if(!saveRootDirectory.mkdirs())
            {
                LechLogger.error("Couldn't create root directory: " + saveRootDirectory);
                LechLogger.info("Defaulting to . instead");
                saveRootDirectory = new File(".");
            }
        }
        else if(!saveRootDirectory.isDirectory())
        {
            LechLogger.error("Save root is not a directory: " + saveRootDirectory);
            LechLogger.info("Defaulting to . instead");
            saveRootDirectory = new File(".");
        }

        String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
        // Check if absolute or relative name given
        boolean absoluteName = mailtoFileStr.indexOf(":") != -1
            || mailtoFileStr.startsWith("/")
            || mailtoFileStr.startsWith("\\");
        if(absoluteName)
        {
            LechLogger.debug("Using absolute file name " + mailtoFileStr);
            mailtoLogFile = new File(mailtoFileStr);
        }
        else
        {
            // Relative names are placed under the save root.
            String relativeName = saveRootDirectory.getPath() + "/" + mailtoFileStr;
            LechLogger.debug("Constructing relative file name " + relativeName);
            mailtoLogFile = new File(relativeName);
        }

        refreshHTMLs = booleanProperty(props, "refreshHTMLs", "true");
        refreshImages = booleanProperty(props, "refreshImages", "false");
        refreshOthers = booleanProperty(props, "refreshOthers", "false");
        htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
        // Same default set as the no-argument constructor (which includes tiff and bmp).
        imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png,tiff,bmp"));

        String startLocStr = props.getProperty("startLocation");
        if(startLocStr != null)
        {
            try
            {
                startLocation = new URL(startLocStr);
            }
            catch(MalformedURLException murle)
            {
                LechLogger.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
            }
        }
        else
        {
            LechLogger.warn("startLocation not found in properties");
        }

        urlMatch = props.getProperty("urlMatch");
        interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
        boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
        depthFirst = booleanProperty(props, "depthFirst", "false");

        try
        {
            String maxDepthStr = props.getProperty("maxDepth", "0");
            maxDepth = Integer.parseInt(maxDepthStr);
        }
        catch(NumberFormatException nfe)
        {
            LechLogger.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
            maxDepth = 1;
        }

        userAgent = props.getProperty("userAgent", "WebLech Spider [Version C]");
        basicAuthUser = props.getProperty("basicAuthUser", "");
        basicAuthPassword = props.getProperty("basicAuthPassword", "");

        try
        {
            String threadsStr = props.getProperty("spiderThreads", "1");
            spiderThreads = Integer.parseInt(threadsStr);
        }
        catch(NumberFormatException nfe)
        {
            LechLogger.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
            spiderThreads = 1;
        }

        try
        {
            String intervalStr = props.getProperty("checkpointInterval", "0");
            checkpointInterval = Long.parseLong(intervalStr);
        }
        catch(NumberFormatException nfe)
        {
            LechLogger.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
            // Reset the value that failed to parse. This used to overwrite
            // spiderThreads, discarding a valid thread count.
            checkpointInterval = 0;
        }
    }

    /** Reads a boolean property; anything other than "true" (ignoring case) is false. */
    private static boolean booleanProperty(Properties props, String key, String defaultValue)
    {
        return Boolean.valueOf(props.getProperty(key, defaultValue)).booleanValue();
    }

    /**
     * Splits a comma-separated property value into a list of substrings.
     * Tokens are trimmed, as in parseSet(), so "a, b" yields "a" and "b";
     * tokens that are empty after trimming are dropped because an empty
     * substring would match every URL.
     *
     * @param str the raw property value, may be null
     * @return a new List of Strings, empty if str is null or empty
     */
    private List parsePropCommaSeparated(String str)
    {
        ArrayList result = new ArrayList();
        if(str != null && str.length() > 0)
        {
            StringTokenizer tok = new StringTokenizer(str, ",");
            while(tok.hasMoreTokens())
            {
                String token = tok.nextToken().trim();
                if(token.length() > 0)
                {
                    result.add(token);
                }
            }
        }
        return result;
    }

    public void setRefreshHTMLs(boolean refreshHTMLs)
    {
        this.refreshHTMLs = refreshHTMLs;
    }

    public boolean refreshHTMLs()
    {
        return refreshHTMLs;
    }

    public void setRefreshImages(boolean refreshImages)
    {
        this.refreshImages = refreshImages;
    }

    public boolean refreshImages()
    {
        return refreshImages;
    }

    public void setRefreshOthers(boolean refreshOthers)
    {
        this.refreshOthers = refreshOthers;
    }

    public boolean refreshOthers()
    {
        return refreshOthers;
    }

    public void setSaveRootDirectory(File saveRootDirectory)
    {
        this.saveRootDirectory = saveRootDirectory;
    }

    public File getSaveRootDirectory()
    {
        return saveRootDirectory;
    }

    public void setMailtoLogFile(File mailtoLogFile)
    {
        this.mailtoLogFile = mailtoLogFile;
    }

    public File getMailtoLogFile()
    {
        return mailtoLogFile;
    }

    public void setStartLocation(URL startLocation)
    {
        this.startLocation = startLocation;
    }

    /** @return the start URL; null if none was configured or it could not be parsed */
    public URL getStartLocation()
    {
        return startLocation;
    }

    public void setURLMatch(String urlMatch)
    {
        this.urlMatch = urlMatch;
    }

    /** @return the configured URL match string; null if none was configured */
    public String getURLMatch()
    {
        return urlMatch;
    }

    /** @return the live internal list of "interesting" substrings (not a copy) */
    public List getInterestingURLSubstrings()
    {
        return interestingURLSubstrings;
    }

    public void setInterestingURLSubstrings(List interestingURLSubstrings)
    {
        this.interestingURLSubstrings = interestingURLSubstrings;
    }

    /** @return the live internal list of "boring" substrings (not a copy) */
    public List getBoringURLSubstrings()
    {
        return boringURLSubstrings;
    }

    public void setBoringURLSubstrings(List boringURLSubstrings)
    {
        this.boringURLSubstrings = boringURLSubstrings;
    }

    /** @return true if the URL contains any of the "interesting" substrings */
    public boolean isInteresting(URL u)
    {
        return matchURL(u, interestingURLSubstrings);
    }

    /** @return true if the URL contains any of the "boring" substrings */
    public boolean isBoring(URL u)
    {
        return matchURL(u, boringURLSubstrings);
    }

    /** Tests whether the external form of the URL contains at least one of the given Strings. */
    private boolean matchURL(URL u, List substrings)
    {
        String str = u.toExternalForm();
        for(Iterator i = substrings.iterator(); i.hasNext(); )
        {
            String substr = (String) i.next();
            if(str.indexOf(substr) != -1)
            {
                return true;
            }
        }
        return false;
    }

    public void setDepthFirstSearch(boolean depthFirst)
    {
        this.depthFirst = depthFirst;
    }

    public boolean isDepthFirstSearch()
    {
        return depthFirst;
    }

    public void setMaxDepth(int maxDepth)
    {
        this.maxDepth = maxDepth;
    }

    public int getMaxDepth()
    {
        return maxDepth;
    }

    public void setUserAgent(String userAgent)
    {
        this.userAgent = userAgent;
    }

    public String getUserAgent()
    {
        return userAgent;
    }

    public void setBasicAuthUser(String basicAuthUser)
    {
        this.basicAuthUser = basicAuthUser;
    }

    public String getBasicAuthUser()
    {
        return basicAuthUser;
    }

    public void setBasicAuthPassword(String basicAuthPassword)
    {
        this.basicAuthPassword = basicAuthPassword;
    }

    public String getBasicAuthPassword()
    {
        return basicAuthPassword;
    }

    public void setSpiderThreads(int spiderThreads)
    {
        this.spiderThreads = spiderThreads;
    }

    public int getSpiderThreads()
    {
        return spiderThreads;
    }

    public void setCheckpointInterval(long interval)
    {
        this.checkpointInterval = interval;
    }

    /** @return the checkpoint interval; 0 is the default */
    public long getCheckpointInterval()
    {
        return checkpointInterval;
    }

    /**
     * Lists most settings, one "name:&lt;tab&gt;value" pair per line. The basic
     * auth password is always shown as "***".
     */
    public String toString()
    {
        return "depthFirst:\t" + depthFirst
            + "\nmaxDepth:\t" + maxDepth
            + "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
            + "\nimageExtensions:\t" + fromSet(imageExtensions)
            + "\nrefreshHTMLs:\t" + refreshHTMLs
            + "\nrefreshImages:\t" + refreshImages
            + "\nrefreshOthers:\t" + refreshOthers
            + "\nsaveRootDirectory:\t" + saveRootDirectory
            + "\nstartLocation:\t" + startLocation
            + "\nurlMatch:\t" + urlMatch
            + "\nuserAgent:\t" + userAgent
            + "\nbasicAuthUser:\t" + basicAuthUser
            + "\nbasicAuthPassword:\t" + "***"
            + "\nspiderThreads:\t" + spiderThreads
            + "\ncheckpointInterval:\t" + checkpointInterval;
    }

    /** Splits a comma-separated string into a Set of trimmed tokens. */
    private Set parseSet(String str)
    {
        LechLogger.debug("parseSet(" + str + ")");
        HashSet result = new HashSet();
        StringTokenizer sTok = new StringTokenizer(str, ",");
        while(sTok.hasMoreTokens())
        {
            String tok = sTok.nextToken().trim();
            result.add(tok);
        }
        return result;
    }

    /** Joins the Strings of a set with commas, in the set's iteration order. */
    private String fromSet(Set s)
    {
        StringBuffer sb = new StringBuffer();
        boolean first = true;
        for(Iterator i = s.iterator(); i.hasNext(); )
        {
            String str = (String) i.next();
            if(first)
            {
                first = false;
            }
            else
            {
                sb.append(",");
            }
            sb.append(str);
        }
        return sb.toString();
    }
} // End class SpiderConfig

138
weblech/spider/URLGetter.java Executable file
View File

@ -0,0 +1,138 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.Authenticator;
import java.io.*;
import weblech.ui.LechLogger;
/**
 * Downloads the content of a single URL over HTTP, applying the request
 * settings (user agent, basic-auth credentials) held in a SpiderConfig.
 *
 * The instance keeps a running failure counter; once it exceeds a threshold
 * the next download is delayed to give a struggling server or network a rest.
 */
public class URLGetter
{
    /** Number of recorded failures that must be exceeded before a download is delayed. */
    private static final int FAILURE_THRESHOLD = 10;
    /** How long to pause once the failure threshold has been exceeded. */
    private static final long BACKOFF_MILLIS = 5 * 1000;
    /** Size of the chunk buffer used while reading the response body. */
    private static final int READ_BUFFER_SIZE = 1024;
    /** Initial capacity of the in-memory buffer collecting the response body. */
    private static final int INITIAL_CONTENT_CAPACITY = 10240;

    /** Failures recorded since the last back-off pause. */
    private int failureCount = 0;
    private final SpiderConfig config;

    /**
     * Creates a getter and installs a JVM-wide default Authenticator built
     * from the basic-auth user and password of the given configuration.
     *
     * @param config the spider configuration supplying request settings
     */
    public URLGetter(SpiderConfig config)
    {
        LechLogger.debug("URLGetter()");
        this.config = config;
        Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
    }

    /**
     * Downloads the given URL and returns its content.
     *
     * @param url the URL to fetch, together with an optional referer
     * @return the downloaded object, or null when the resource was not found,
     *         an I/O error occurred, or fewer bytes arrived than the server
     *         announced in its Content-Length header
     */
    public URLObject getURL(URLToDownload url)
    {
        LechLogger.debug("getURL(" + url + ")");
        backOffIfFailing();
        URL requestedURL = url.getURL();
        URL referer = url.getReferer();
        InputStream remote = null;
        try
        {
            LechLogger.debug("Creating HTTP connection to " + requestedURL);
            HttpURLConnection conn = (HttpURLConnection) requestedURL.openConnection();
            if(referer != null)
            {
                LechLogger.debug("Setting Referer header to " + referer);
                conn.setRequestProperty("Referer", referer.toExternalForm());
            }
            if(config.getUserAgent() != null)
            {
                LechLogger.debug("Setting User-Agent to " + config.getUserAgent());
                conn.setRequestProperty("User-Agent", config.getUserAgent());
            }
            conn.setUseCaches(false);
            LechLogger.debug("Opening URL");
            long startTime = System.currentTimeMillis();
            conn.connect();
            String resp = conn.getResponseMessage();
            LechLogger.debug("Remote server response: " + resp);
            String respStr = conn.getHeaderField(0);
            LechLogger.info("Server response: " + respStr);
            logResponseHeaders(conn);

            LechLogger.debug("Getting buffered input stream from remote connection");
            remote = new BufferedInputStream(conn.getInputStream());
            byte[] content = readFully(remote);

            long timeTaken = System.currentTimeMillis() - startTime;
            // Very short transfers are reported as if they took half a second;
            // this also keeps the division below away from zero.
            if(timeTaken < 100) timeTaken = 500;
            int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
            LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");

            // getContentLength() is -1 when the server sent no length, so the
            // check only triggers for a genuinely truncated body.
            if(content.length < conn.getContentLength())
            {
                LechLogger.warn("Didn't download full content for URL: " + url);
                failureCount++;
                return null;
            }
            return new URLObject(requestedURL, conn.getContentType(), content, config);
        }
        catch(FileNotFoundException fnfe)
        {
            // A missing resource is not counted as a failure.
            LechLogger.warn("File not found: " + fnfe.getMessage());
            return null;
        }
        catch(IOException ioe)
        {
            LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
            failureCount++;
            return null;
        }
        finally
        {
            // Always release the response stream, including on error paths.
            closeQuietly(remote);
        }
    }

    /** Pauses the calling thread and resets the counter once too many failures have piled up. */
    private void backOffIfFailing()
    {
        if(failureCount > FAILURE_THRESHOLD)
        {
            LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
            try
            {
                Thread.sleep(BACKOFF_MILLIS);
            }
            catch(InterruptedException e)
            {
                // Keep the interrupt visible to whoever owns this thread.
                Thread.currentThread().interrupt();
            }
            failureCount = 0;
        }
    }

    /** Logs every response header (index 1 onwards) at debug level. */
    private static void logResponseHeaders(HttpURLConnection conn)
    {
        for(int i = 1; ; i++)
        {
            String key = conn.getHeaderFieldKey(i);
            if(key == null)
            {
                break;
            }
            // Look the value up by index so repeated headers (e.g. Set-Cookie)
            // each log their own value rather than the last one for that name.
            String value = conn.getHeaderField(i);
            LechLogger.debug("Received header " + key + ": " + value);
        }
    }

    /** Reads the stream to its end and returns everything that was read. */
    private static byte[] readFully(InputStream in) throws IOException
    {
        ByteArrayOutputStream collected = new ByteArrayOutputStream(INITIAL_CONTENT_CAPACITY);
        byte[] buf = new byte[READ_BUFFER_SIZE];
        int bytesRead;
        while((bytesRead = in.read(buf)) >= 0)
        {
            collected.write(buf, 0, bytesRead);
        }
        return collected.toByteArray();
    }

    /** Closes the stream if it was opened; a failure to close is only logged. */
    private static void closeQuietly(InputStream in)
    {
        if(in == null)
        {
            return;
        }
        try
        {
            in.close();
        }
        catch(IOException ignored)
        {
            // The download result has already been decided; nothing to recover.
            LechLogger.debug("Could not close response stream: " + ignored.getMessage());
        }
    }
}

206
weblech/spider/URLObject.java Executable file
View File

@ -0,0 +1,206 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import java.io.*;
import java.net.URL;
import java.net.URLEncoder;
import weblech.ui.LechLogger;
/**
 * A downloaded (or previously saved) web resource: its source URL, its
 * content type and its raw bytes, plus the logic that maps the URL to a file
 * underneath the configured save root directory.
 */
public class URLObject
{
    private final URL sourceURL;
    /** MIME type as reported by the server or guessed from the URL; may be null. */
    private final String contentType;
    private final byte[] content;
    private final SpiderConfig config;

    /**
     * Wraps content that has just been downloaded.
     *
     * @param sourceURL   where the content came from
     * @param contentType the Content-Type reported by the server; may be null
     *                    when the server did not send one
     * @param content     the raw bytes of the resource
     * @param config      the spider configuration (supplies the save root)
     */
    public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config)
    {
        this.sourceURL = sourceURL;
        this.contentType = contentType;
        this.content = content;
        this.config = config;
    }

    /**
     * Builds the object from the copy already saved on disk, if there is one.
     * The content type is guessed from the URL text. When no saved file
     * exists, or it cannot be read, the content is empty.
     *
     * @param sourceURL the URL whose saved copy should be loaded
     * @param config    the spider configuration (supplies the save root)
     */
    public URLObject(URL sourceURL, SpiderConfig config)
    {
        this.sourceURL = sourceURL;
        this.config = config;
        this.contentType = guessContentType(sourceURL);
        byte[] fromDisk = new byte[0];
        // existsOnDisk() is only true for a regular file, never a directory.
        if(existsOnDisk())
        {
            try
            {
                fromDisk = readFully(new File(convertToFileName()));
            }
            catch(IOException ioe)
            {
                LechLogger.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
            }
        }
        this.content = fromDisk;
    }

    /**
     * @return the content type, or null if none is known
     */
    public String getContentType()
    {
        return contentType;
    }

    /**
     * @return true if the content type starts with "text/html" (any letter case)
     */
    public boolean isHTML()
    {
        return hasTypePrefix("text/html");
    }

    /**
     * @return true if the content type starts with "text/xml" (any letter case)
     */
    public boolean isXML()
    {
        return hasTypePrefix("text/xml");
    }

    /**
     * @return true if the content type starts with "image/" (any letter case)
     */
    public boolean isImage()
    {
        return hasTypePrefix("image/");
    }

    /**
     * @return the content decoded with the platform default character set
     */
    public String getStringContent()
    {
        return new String(content);
    }

    /**
     * Maps the source URL to a path below the save root directory: a leading
     * "http://" is dropped, "index.html" is appended for directory-style
     * URLs, and the characters '?' and '&amp;' are URL-encoded.
     */
    private String convertToFileName()
    {
        String url = sourceURL.toExternalForm();
        if(url.startsWith("http://"))
        {
            url = url.substring(7);
        }
        // Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
        if(url.indexOf("/") < 0)
        {
            url = url + "/";
        }
        // If trailing slash, add index.html as default
        if(url.endsWith("/"))
        {
            url = url + "index.html";
        }
        try
        {
            /* the old encode method is now deprecated, updated to the new API -- Coleman */
            url = textReplace("?", URLEncoder.encode("?","UTF-8"), url);
            url = textReplace("&", URLEncoder.encode("&","UTF-8"), url);
        }
        catch ( java.io.UnsupportedEncodingException exception )
        {
            LechLogger.error ( exception.toString() );
        }
        return config.getSaveRootDirectory().getPath() + "/" + url;
    }

    /**
     * @return true if a regular file (not a directory) exists at the location
     *         this URL maps to
     */
    public boolean existsOnDisk()
    {
        File f = new File(convertToFileName());
        return (f.exists() && !f.isDirectory());
    }

    /**
     * Saves the content at the location derived from the source URL.
     */
    public void writeToFile()
    {
        writeToFile(convertToFileName());
    }

    /**
     * Saves the content to the named file, creating missing parent
     * directories first. I/O problems are logged as warnings, not thrown.
     *
     * @param fileName path of the file to write
     */
    public void writeToFile(String fileName)
    {
        LechLogger.debug("writeToFile(" + fileName + ")");
        try
        {
            File f = new File(fileName);
            File parent = f.getParentFile();
            // A bare file name has no parent; nothing to create in that case.
            if(parent != null)
            {
                parent.mkdirs();
            }
            FileOutputStream out = new FileOutputStream(f);
            try
            {
                out.write(content);
                out.flush();
            }
            finally
            {
                // Release the file handle even when the write fails.
                out.close();
            }
        }
        catch(IOException ioe)
        {
            LechLogger.warn("IO Exception writing to " + fileName, ioe);
        }
    }

    /**
     * @return a short description consisting of "URLObject: " and the content type
     */
    public String toString()
    {
        StringBuffer sb = new StringBuffer();
        sb.append("URLObject: ");
        sb.append(contentType);
        return sb.toString();
    }

    /** Guesses a content type from the ".jpg"/".gif" markers in the URL text; defaults to HTML. */
    private static String guessContentType(URL url)
    {
        String s = url.toExternalForm().toLowerCase();
        if(s.indexOf(".jpg") != -1)
        {
            return "image/jpeg";
        }
        if(s.indexOf(".gif") != -1)
        {
            return "image/gif";
        }
        return "text/html";
    }

    /** Case-insensitive, null-safe check that the content type begins with the given prefix. */
    private boolean hasTypePrefix(String prefix)
    {
        // regionMatches(ignoreCase=true) avoids locale-dependent lower-casing.
        return contentType != null
            && contentType.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Reads a whole file into memory, looping until end of file and always closing the stream. */
    private static byte[] readFully(File file) throws IOException
    {
        FileInputStream in = new FileInputStream(file);
        try
        {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buf = new byte[4096];
            int n;
            // A single read() may return fewer bytes than requested, so loop.
            while((n = in.read(buf)) >= 0)
            {
                collected.write(buf, 0, n);
            }
            return collected.toByteArray();
        }
        finally
        {
            in.close();
        }
    }

    /** Replaces every occurrence of a literal text; replaced text is never re-scanned. */
    private String textReplace(String find, String replace, String input)
    {
        int startPos = 0;
        while(true)
        {
            int textPos = input.indexOf(find, startPos);
            if(textPos < 0)
            {
                break;
            }
            input = input.substring(0, textPos) + replace + input.substring(textPos + find.length());
            // Continue after the inserted text so it cannot match again.
            startPos = textPos + replace.length();
        }
        return input;
    }
}

View File

@ -0,0 +1,68 @@
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/URLToDownload.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
package weblech.spider;
import java.net.URL;
/**
 * Immutable queue entry for the spider: a URL to fetch, the page that linked
 * to it (if any) and its link depth.
 */
public class URLToDownload implements java.io.Serializable {
    private final URL url;
    private final URL referer;
    private final int depth;

    /**
     * Creates an entry without a referer.
     *
     * @param url   the URL to download
     * @param depth the depth recorded for this URL
     */
    public URLToDownload(URL url, int depth) {
        this.url = url;
        this.referer = null;
        this.depth = depth;
    }

    /**
     * Creates an entry that remembers which page linked to the URL.
     *
     * @param url     the URL to download
     * @param referer the referring URL, or null if there is none
     * @param depth   the depth recorded for this URL
     */
    public URLToDownload(URL url, URL referer, int depth) {
        this.url = url;
        this.referer = referer;
        this.depth = depth;
    }

    /** @return the URL to download */
    public URL getURL() {
        return this.url;
    }

    /** @return the referring URL, or null if none was given */
    public URL getReferer() {
        return this.referer;
    }

    /** @return the depth recorded for this URL */
    public int getDepth() {
        return this.depth;
    }

    /** @return text of the form "url, referer referer, depth depth" */
    public String toString() {
        StringBuffer text = new StringBuffer();
        text.append(this.url);
        text.append(", referer ");
        text.append(this.referer);
        text.append(", depth ");
        text.append(this.depth);
        return text.toString();
    }
}

138
weblech/ui/LechLogger.java Executable file
View File

@ -0,0 +1,138 @@
/**
* LechLogger.java: A Graphical Logger
* The original weblech had a simple text interface and used an apache.org library for logging.
* When I wanted to make this thing into a graphical app, I realized the text logging
* was going to have to go. It assumes you only want to log to one location (not a problem for
* this application).
*/
package weblech.ui;
import javax.swing.JTextArea;
import java.io.IOException;
/**
 * Static logging facade that writes messages into a single Swing text area.
 * Four message kinds (error, warning, info, debug) can each be switched on or
 * off; all four start enabled. While no text area is attached, messages are
 * silently dropped.
 */
public class LechLogger {
    /**
     * The actual text area that will perform all output.
     */
    private static JTextArea _loggerWindow;

    /**
     * These are flags for enabling different types of logging mechanisms.
     */
    private static boolean error_enable, warn_enable, info_enable, debug_enable;

    /**
     * Only need one initialization since this will be shared between many different
     * objects.
     */
    static {
        _loggerWindow = null;
        error_enable = true;
        warn_enable = true;
        info_enable = true;
        debug_enable = true;
    }

    /** Everybody wants to log, but you only need one logger! */
    public LechLogger() {
    }

    /**
     * Sets the textual component to perform the logging.
     *
     * @param textarea the component to append messages to, or null to drop
     *                 all messages
     */
    public static synchronized void setTextArea ( JTextArea textarea ) {
        // Synchronized on the same lock as log() so the new target is seen by
        // every logging thread.
        _loggerWindow = textarea;
    }

    /**
     * A private method for actually writing the messages.
     * It is synchronized because the weblech spider is multi
     * threaded. The text area itself is only touched on the Swing event
     * dispatch thread, since Swing components must not be modified from
     * arbitrary threads.
     */
    private static synchronized void log ( final String msg ) {
        final JTextArea target = _loggerWindow;
        if ( target == null ) {
            return;
        }
        // invokeLater runs the queued updates in posting order, so the
        // messages keep the order in which they were logged.
        javax.swing.SwingUtilities.invokeLater ( new Runnable() {
            public void run() {
                target.append ( msg );
                target.append ( "\n" );
                // The log is read-only for the user.
                target.setEditable ( false );
            }
        } );
    }

    /**
     * Appends a description of the exception to a message. A null exception
     * adds nothing; an exception without a message contributes its
     * toString() instead of the text "null".
     */
    private static String withException ( String msg, Exception exception ) {
        if ( exception == null ) {
            return msg;
        }
        String detail = exception.getMessage();
        if ( detail == null ) {
            detail = exception.toString();
        }
        return msg + "\n" + detail;
    }

    /**
     * Toggle error logging.
     */
    public static void setErrorLogging() {
        error_enable = !error_enable;
    }

    /**
     * Log an error message.
     *
     * @param msg the text to log
     */
    public static void error ( String msg ) {
        if ( !error_enable ) return;
        log ( "*error>" + msg );
    }

    /**
     * Log an error message and an exception.
     *
     * @param msg       the text to log
     * @param exception the cause to describe on the following line; may be null
     */
    public static void error ( String msg, Exception exception ) {
        if ( !error_enable ) return;
        log ( withException ( "*error>" + msg, exception ) );
    }

    /**
     * Toggle informational messages.
     */
    public static void setInformationalLogging() {
        info_enable = !info_enable;
    }

    /**
     * Log an informational message.
     *
     * @param msg the text to log
     */
    public static void info ( String msg ) {
        if ( !info_enable ) return;
        log ( "^info>" + msg );
    }

    /**
     * Toggle warning messages.
     */
    public static void setWarningLogging() {
        warn_enable = !warn_enable;
    }

    /**
     * Log a warning message.
     *
     * @param msg the text to log
     */
    public static void warn ( String msg ) {
        if ( !warn_enable ) return;
        log ( "-warn>" + msg );
    }

    /**
     * Log a warning message with an exception.
     *
     * @param msg       the text to log
     * @param exception the cause to describe on the following line; may be null
     */
    public static void warn ( String msg, IOException exception ) {
        if ( !warn_enable ) return;
        log ( withException ( "-warn>" + msg, exception ) );
    }

    /**
     * Toggle debug messages to be printed.
     */
    public static void setDebugLogging() {
        debug_enable = !debug_enable;
    }

    /**
     * Log a debugging statement to the logging text area.
     *
     * @param msg the text to log
     */
    public static void debug ( String msg ) {
        if ( !debug_enable ) return;
        log ( "@debug>" + msg );
    }
}

251
weblech/ui/SpiderConfigPanel.java Executable file
View File

@ -0,0 +1,251 @@
/**
* SpiderConfigPanel.java: A graphical panel for configuring a SpiderConfig object.
* This panel provides a way to change the more practical options of the WebLech Spider.
* It supports saving and opening of SpiderConfigurations from a file. It does not use
* any of the "interesting" or "boring" url features, or the email link save file.
*/
package weblech.ui;
import weblech.spider.Spider;
import weblech.spider.SpiderConfig;
import javax.swing.JPanel;
import javax.swing.JTextField;
import javax.swing.JLabel;
import javax.swing.JButton;
import javax.swing.JComboBox;
import javax.swing.JFileChooser;
import java.awt.GridLayout;
import java.awt.FlowLayout;
import java.awt.event.ActionListener;
import java.awt.event.ActionEvent;
import java.util.Properties;
import java.util.ArrayList;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
public class SpiderConfigPanel extends JPanel implements ActionListener {
/**
* A list of all of the spiders that the GUI will have downloading. It is assumed
* that the user knows how much bandwidth you have and really wants to try and get
* several different sites at the same time.
*/
private ArrayList spiders;
/**
* Various text fields for the configuration options.
*/
private JTextField sitenamefield, dirfield, usernamefield, passwordfield, agentfield, depthfield, matchfield;
/**
* A selection box for the number of threads a new Spider should use, I am limiting
* the number of threads to 4 for simplicity.
*/
private JComboBox threadbox;
public SpiderConfigPanel() {
super ( new GridLayout ( 8, 1 ) );
spiders = new ArrayList();
/* Panel for the directory to save all files */
JPanel sitepanel = new JPanel();
((FlowLayout) sitepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel sitelabel = new JLabel ( "Output Directory:" );
dirfield = new JTextField ( System.getProperty ( "user.home" ), 20 );
sitepanel.add ( sitelabel );
sitepanel.add ( dirfield );
/* Panel for the site to download */
JPanel outputpanel = new JPanel();
((FlowLayout) outputpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel dirlabel = new JLabel ( "Download Site:" );
sitenamefield = new JTextField ( "http://www.google.com/", 20 );
outputpanel.add ( dirlabel );
outputpanel.add ( sitenamefield );
/* Panel for the HTTP username */
JPanel usernamepanel = new JPanel();
((FlowLayout) usernamepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel usernamelabel = new JLabel ( "Username:" );
usernamefield = new JTextField ( "", 20 );
usernamepanel.add ( usernamelabel );
usernamepanel.add ( usernamefield );
/* Panel for the HTTP password */
JPanel passpanel = new JPanel();
((FlowLayout) passpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel passwdlabel = new JLabel ( "Password:" );
passwordfield = new JTextField ( "", 20 );
passpanel.add ( passwdlabel );
passpanel.add ( passwordfield );
/* Panel for the HTTP user agent */
JPanel agentpanel = new JPanel();
((FlowLayout) agentpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel agentlabel = new JLabel ( "User Agent:" );
agentfield = new JTextField ( "WebLech [Version C]", 20 );
agentpanel.add ( agentlabel );
agentpanel.add ( agentfield );
/* Panel for a simple string match downloading limiter (no match, no download) */
JPanel matchpanel = new JPanel();
((FlowLayout) matchpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel matchlabel = new JLabel ( "Match String:" );
matchfield = new JTextField ( "", 20 );
matchpanel.add ( matchlabel );
matchpanel.add ( matchfield );
/* Provides a panel for placing both the maximum depth and threads for this spider */
JPanel detailpanel = new JPanel ( new GridLayout ( 1, 2 ) );
JPanel depthpanel = new JPanel();
((FlowLayout) depthpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
JLabel depthlabel = new JLabel ( "Max Depth:" );
depthfield = new JTextField ( Integer.toString ( 0 ), 5 );
depthpanel.add ( depthlabel );
depthpanel.add ( depthfield );
JPanel threadpanel = new JPanel();
JLabel threadlabel = new JLabel ( "Spider Threads:" );
Integer[] threaditems = { new Integer ( "1" ), new Integer ( "2" ), new Integer ( "3" ), new Integer ( "4" ) };
threadbox = new JComboBox ( threaditems );
threadpanel.add ( threadlabel );
threadpanel.add ( threadbox );
detailpanel.add ( depthpanel );
detailpanel.add ( threadpanel );
/* Panel of buttons for various operations */
JPanel buttonpanel = new JPanel();
JButton save = new JButton ( "Save" );
JButton spiderbutton = new JButton ( "Spider It" );
JButton open = new JButton ( "Open" );
JButton qbutton = new JButton ( "Quit" );
buttonpanel.add ( save );
buttonpanel.add ( spiderbutton );
buttonpanel.add ( open );
buttonpanel.add ( qbutton );
add ( sitepanel );
add ( outputpanel );
add ( usernamepanel );
add ( passpanel );
add ( agentpanel );
add ( matchpanel );
add ( detailpanel );
add ( buttonpanel );
/* Configure the button actions */
save.setActionCommand ( "save" );
open.setActionCommand ( "open" );
spiderbutton.setActionCommand ( "spider" );
qbutton.setActionCommand ( "quit" );
save.addActionListener ( this );
open.addActionListener ( this );
spiderbutton.addActionListener ( this );
qbutton.addActionListener ( this );
LechLogger.debug ( "Actions" );
}
/**
* This method will create a Properties object good for instantiating a new SpiderConfig
* Object.
*/
private Properties createProperties() {
Properties p = new Properties();
p.setProperty ( "saveRootDirectory", dirfield.getText() );
p.setProperty ( "startLocation", sitenamefield.getText() );
p.setProperty ( "basicAuthUser", usernamefield.getText() );
p.setProperty ( "basicAuthPassword", passwordfield.getText() );
p.setProperty ( "urlMatch", matchfield.getText() );
p.setProperty ( "spiderThreads", ((Integer) threadbox.getSelectedItem()).toString() );
p.setProperty ( "maxDepth", depthfield.getText() );
p.setProperty ( "userAgent", agentfield.getText() );
p.setProperty ( "interestingURLs", "" );
return p;
}
/**
* This method will extract all of the values from a SpiderConfig object that the GUI uses
* and updates the panel to show the values in the object.
*/
private void setSpiderConfig ( SpiderConfig sc ) {
dirfield.setText ( sc.getSaveRootDirectory().toString() );
sitenamefield.setText ( sc.getStartLocation().toString() );
usernamefield.setText ( sc.getBasicAuthUser() );
passwordfield.setText ( sc.getBasicAuthPassword() );
matchfield.setText ( sc.getURLMatch() );
int t = sc.getSpiderThreads();
if ( t < 1 || t > 4 ) {
t = 1;
sc.setSpiderThreads ( t );
}
threadbox.setSelectedIndex ( t - 1 );
depthfield.setText ( Integer.toString ( sc.getMaxDepth() ) );
agentfield.setText ( sc.getUserAgent() );
}
/**
* This method will coordinate all of the actions for the various buttons used.
*/
public void actionPerformed ( ActionEvent event ) {
String cmd = event.getActionCommand();
/* Download a new site */
if ( cmd.equals ( "spider" ) ) {
SpiderConfig c = new SpiderConfig ( createProperties() );
Spider spider = new Spider ( c );
/* But only if we are not already downloading the site */
if ( spiders.contains ( spider ) ) {
LechLogger.warn ( "Already have an instance of a Spider at " + c.getStartLocation() );
return;
}
spiders.add ( spider );
spider.start();
}
/* Save the current configuration to a file */
else if ( cmd.equals ( "save" ) ) {
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
int r = f.showSaveDialog ( this );
if ( r != JFileChooser.APPROVE_OPTION ) return;
File outfile = f.getSelectedFile();
try {
ObjectOutputStream os = new ObjectOutputStream ( new FileOutputStream ( outfile ) );
os.writeObject ( new SpiderConfig ( createProperties() ) );
os.close();
}
catch ( Exception exception ) {
LechLogger.error ( exception.toString() );
}
}
/* Open a saved configuration from a file */
else if ( cmd.equals ( "open" ) ) {
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
int r = f.showOpenDialog ( this );
if ( r != JFileChooser.APPROVE_OPTION ) return;
File infile = f.getSelectedFile();
if ( !infile.canRead() ) {
LechLogger.error ( "file " + f.toString() + " is not readable" );
return;
}
try {
ObjectInputStream os = new ObjectInputStream ( new FileInputStream ( infile ) );
SpiderConfig sc = (SpiderConfig) os.readObject();
os.close();
setSpiderConfig ( sc );
}
catch ( Exception exception ) {
LechLogger.error ( exception.toString() );
}
}
/* Just quit */
else if ( cmd.equals ( "quit" ) ) {
System.exit ( 0 );
}
}
}

113
weblech/ui/Troll.java Executable file
View File

@ -0,0 +1,113 @@
/**
* Troll.java: A user interface to the weblech spider download utility.
*/
package weblech.ui;
/* I like to explicitly import all of my packages to remind me to KISS */
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.JTabbedPane;
import javax.swing.JTextArea;
import javax.swing.JScrollPane;
import javax.swing.JMenuBar;
import javax.swing.JMenu;
import javax.swing.JMenuItem;
import javax.swing.JCheckBoxMenuItem;
import java.awt.Dimension;
import java.awt.event.ActionListener;
import java.awt.event.ActionEvent;
/**
 * Main window of the application: a "Spider" tab with the configuration
 * form, a "Log" tab showing logger output, and a menu for choosing which
 * kinds of log messages are shown.
 */
public class Troll extends JFrame implements ActionListener {
    /**
     * This SpiderConfigPanel is a custom panel that provides many of the more
     * practical features of the weblech spider. It also controls the spiders
     * created by the user.
     */
    private SpiderConfigPanel configpanel;

    /**
     * This is the area that all of the logging facilities will use. This makes debugging
     * in a system like Mac OS X much simpler (:^)
     */
    private static JTextArea logarea;

    /* This just initializes the logging text box and readies it for recording events before
     * the rest of the object is even loaded. Debug logging is toggled once here; together
     * with the "Show Debug Messages" menu item starting unchecked, debug output starts hidden.
     */
    static {
        logarea = new JTextArea();
        LechLogger.setTextArea ( logarea );
        LechLogger.setDebugLogging();
    }

    /**
     * Builds the menu bar and the two tabs, then shows the window centred on
     * the screen. Closing the window exits the application.
     */
    Troll() {
        super ( "Troll" );
        Dimension initialsize = new Dimension ( 400, 375 );
        setSize ( initialsize);

        /* Create a menubar for controlling which aspects of the log you wish to see */
        JMenuBar menubar = new JMenuBar();
        JMenu logmenu = new JMenu ( "Log Options" );
        JCheckBoxMenuItem showdebug = new JCheckBoxMenuItem ( "Show Debug Messages", false );
        showdebug.setActionCommand ( "debug" );
        showdebug.addActionListener ( this );
        JCheckBoxMenuItem showinfo = new JCheckBoxMenuItem ( "Show Informational Messages", true );
        showinfo.setActionCommand ( "info" );
        showinfo.addActionListener ( this );
        JCheckBoxMenuItem showwarn = new JCheckBoxMenuItem ( "Show Warnings", true );
        showwarn.setActionCommand ( "warn" );
        showwarn.addActionListener ( this );
        JCheckBoxMenuItem showerror = new JCheckBoxMenuItem ( "Show Errors", true );
        showerror.setActionCommand ( "error" );
        showerror.addActionListener ( this );
        logmenu.add ( showdebug );
        logmenu.add ( showinfo );
        logmenu.add ( showwarn );
        logmenu.add ( showerror );
        menubar.add ( logmenu );

        /* A simple tab interface between configuration and error checking */
        configpanel = new SpiderConfigPanel();
        JPanel logpanel = new JPanel();
        logpanel.add ( logarea );
        JScrollPane logscroller = new JScrollPane ( logpanel );
        JTabbedPane tabs = new JTabbedPane();
        tabs.addTab ( "Spider", configpanel );
        tabs.addTab ( "Log", logscroller );

        /* Configure the JFrame to a usable state */
        setJMenuBar ( menubar );
        getContentPane().add ( tabs );
        setLocationRelativeTo ( null );
        setVisible ( true );
        setDefaultCloseOperation ( JFrame.EXIT_ON_CLOSE );
    }

    /**
     * This method basically toggles all of the logging options: each menu
     * item flips the logger switch of the same name.
     */
    public void actionPerformed ( ActionEvent event ) {
        String cmd = event.getActionCommand();
        if ( cmd.equals ( "debug" ) ) {
            LechLogger.setDebugLogging();
        }
        else if ( cmd.equals ( "info" ) ) {
            LechLogger.setInformationalLogging();
        }
        else if ( cmd.equals ( "warn" ) ) {
            /* "Show Warnings" must flip the warning switch, not the debug one */
            LechLogger.setWarningLogging();
        }
        else if ( cmd.equals ( "error" ) ) {
            LechLogger.setErrorLogging();
        }
    }

    /**
     * Create a new troll and go
     */
    public static void main ( String[] args ) {
        /* The constructor shows the window; no reference needs to be kept */
        new Troll();
    }
}