Re: Threads in Web Crawler
Quote:
its not letting me click the stop button
That sounds like you are using the Swing EDT thread and not letting Swing handle the GUI on that thread. All your threads should be in the background, leaving the GUI thread to Swing.
Also you should use the SwingUtilities methods like invokeLater() when you are updating Swing components. That will put the updating code on the Swing EDT.
Re: Threads in Web Crawler
wait, im using a regular Thread... the one in java.lang... and if you all have four workers running, you have only five threads running: one for each worker and then the updater thread... im not exactly following what your saying... the main thread sets up the gui, and the CrawlManager, which then that sets up Threads for the Updater and each worker you have... the main thread is then just pretty much there so you can press the start and stop button, because updater refreshed the graphics... and the updater and crawler threads ONLY start when pressing, well, start. same thing with stop, you have to press stop to quit those threads... you just cant exit the program or else you have six javaw processes hanging in the background.... a little note about myself, im the person who figured out that "hey, this code allows me to do this!" because ive been searching the java 6 api and going between different websites and making a mental map of how this code works, but i really, in alot of cases, especially with Swing, dont know how that really works... whats going on to make that code do what it does... which is why im kinda bad at Swing, and now to the main point... the way i program would seem to alot of programmers, i would say the correct word would be archaeic. to alot of programmers the way i do it, the way i understand it, seems to be the long way... just throwing that out there... lol
Re: Threads in Web Crawler
You might have missed this. I added it after the first bit:
Also you should use the SwingUtilities methods like invokeLater() when you are updating Swing components. That will put the updating code on the Swing EDT. Swing is not thread save.
For example the following is updating a Swing component. It should be in a SwingUtilities invoke...() method:
Re: Threads in Web Crawler
Quote:
For example the following is updating a Swing component.
Actually, JTextArea.append is thread safe ;) but this is definitely good advice to follow.
I'm not sure I'm following this thread, as it does not have much code associated with it. But if you are pushing your machine to 90%, then even if you have threaded your application correctly you should expect some slowing down because your machine won't be able to catch up - especially if you are accumulating string's which require more and more memory to store you then run into a wall of memory and processors.
BTW, if you get exceptions, it helps to post the full stack trace.
Re: Threads in Web Crawler
well, i have one, taken care of that memory problem, at least with the "logs", by clearing each string out before it returns the data... but just to resolve any discrepancies... im posting the full code... its only four classes... shouldnt take much... feel free to look and scavenge thru it... heck, i didnt even make the crawler method, i copied it from something i found off of suns website, then built the rest of the program around it... lol.
Code :
import java.awt.event.ActionListener;
import java.awt.event.ActionEvent;
import javax.swing.text.DefaultCaret;
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JButton;
import javax.swing.JLabel;
import java.awt.Dimension;
import java.awt.Toolkit;
import java.util.ArrayList;
import java.io.PrintWriter;
import java.io.FileOutputStream;
public class WebCrawler implements ActionListener
{
JFrame frame;
JPanel panel;
JTextArea input,found;
JTextArea[] outputs;
String[] starts;
CrawlManager cm;
public void start()
{
String text = input.getText();
starts = text.split("\n");
cm = new CrawlManager(starts,outputs);
cm.start();
}
public void stop()
{
cm.stop();
ArrayList urls = cm.getUrls();
try
{
PrintWriter pw = new PrintWriter(new FileOutputStream("F:\\results.txt"));
for(int x = 0;x<urls.size();x++)
{
pw.println((String)urls.get(x));
}
pw.flush();
pw.close();
}
catch(Exception ex)
{
}
}
public void actionPerformed(ActionEvent e)
{
String a = e.getActionCommand();
if(a=="Start")
{
start();
}
else if(a=="Stop")
{
stop();
}
}
public static void main (String argv[])
{
new WebCrawler().go();
}
public void go()
{
frame = new JFrame();
frame.setTitle("Sean's Crawler");
Dimension dim = Toolkit.getDefaultToolkit().getScreenSize();
frame.setBounds(0,0,dim.width,dim.height);
panel = new JPanel();
panel.setLayout(null);
input = new JTextArea();
input.setBounds(0,0,frame.getWidth()-500,frame.getHeight()/2-100);
panel.add(input);
found = new JTextArea();
DefaultCaret caret = (DefaultCaret)found.getCaret();
caret.setUpdatePolicy(DefaultCaret.ALWAYS_UPDATE);
found.setLineWrap(true);
JScrollPane jsp = new JScrollPane(found);
jsp.setBounds(frame.getWidth()-500,50,200,frame.getHeight()/2-100);
jsp.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS);
found.setEditable(false);
panel.add(jsp);
JLabel l = new JLabel("URL's Found: ");
l.setBounds(frame.getWidth()-500,0,200,50);
panel.add(l);
outputs = new JTextArea[5];
outputs = createOutputs();
outputs[4] = found;
JButton start = new JButton("Start");
start.addActionListener(this);
start.setBounds(frame.getWidth()-300,0,300,frame.getHeight()/2-5);
panel.add(start);
JButton stop = new JButton("Stop");
stop.setBounds(frame.getWidth()-250,frame.getHeight()/2+5,250,frame.getHeight()/2-5);
stop.addActionListener(this);
panel.add(stop);
frame.getContentPane().add(panel);
frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
frame.setVisible(true);
}
public JTextArea[] createOutputs()
{
for(int x = 0;x<4;x++)
{
JTextArea output = new JTextArea();
DefaultCaret caret = (DefaultCaret)output.getCaret();
caret.setUpdatePolicy(DefaultCaret.ALWAYS_UPDATE);
output.setLineWrap(true);
JScrollPane jsp = new JScrollPane(output);
jsp.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS);
output.setEditable(false);
jsp.setBounds(x*200,frame.getHeight()/2+5,200,frame.getHeight()/2-5);
JLabel l = new JLabel("Worker : "+x);
l.setBounds(x*200,frame.getHeight()/2-50,100,100);
panel.add(l);
panel.add(jsp);
outputs[x] = output;
}
return outputs;
}
}
Code :
import javax.swing.JTextArea;
import java.util.ArrayList;
public class CrawlManager
{
String[] starts;
Crawler[] crawlers;
Updater upd;
ArrayList urls;
public CrawlManager(String[] starts,JTextArea[] outputs)
{
this.starts = starts;
urls = new ArrayList();
switch(starts.length)
{
case 1:
{
crawlers = new Crawler[1];
crawlers[0] = new Crawler(starts[0]);
break;
}
case 2:
{
crawlers = new Crawler[2];
crawlers[0] = new Crawler(starts[0]);
crawlers[1] = new Crawler(starts[1]);
break;
}
case 3:
{
crawlers = new Crawler[3];
crawlers[0] = new Crawler(starts[0]);
crawlers[1] = new Crawler(starts[1]);
crawlers[2] = new Crawler(starts[2]);
break;
}
case 4:
{
crawlers = new Crawler[4];
crawlers[0] = new Crawler(starts[0]);
crawlers[1] = new Crawler(starts[1]);
crawlers[2] = new Crawler(starts[2]);
crawlers[3] = new Crawler(starts[3]);
break;
}
default:
{
throw new RuntimeException("You have submitted a wrong value!");
}
}
upd = new Updater(crawlers,outputs,this);
}
public ArrayList getUrls()
{
return urls;
}
public void start()
{
if(crawlers!=null)
{
for(int x = 0;x<starts.length;x++)
{
crawlers[x].start();
}
}
upd.start();
}
public void stop()
{
upd.stop();
for(int x = 0;x<starts.length;x++)
{
ArrayList s = crawlers[x].searched;
for(int d = 0;d<s.size();d++)
{
urls.add(s.get(d));
}
crawlers[x] = null;
}
}
}
Code :
import java.util.*;
import java.net.*;
import java.io.*;
public class Crawler extends Thread
{
ArrayList toSearch,searched;
String start;
String log;
public Crawler(String start)
{
this.start = start;
toSearch = new ArrayList();
searched = new ArrayList();
}
public String getStatus()
{
String j = log;
log = new String();
return j;
}
private void setStatus(String text)
{
log+=text;
}
boolean robotSafe(URL url)
{
String strHost = url.getHost();
String strRobot = "http://" + strHost + "/robots.txt";
URL urlRobot;
try
{
urlRobot = new URL(strRobot);
}
catch (MalformedURLException e)
{
return false;
}
String strCommands;
try
{
InputStream urlRobotStream = urlRobot.openStream();
strCommands = new String();
Scanner f = new Scanner(urlRobotStream);
while(f.hasNext())
{
strCommands+=f.next();
}
urlRobotStream.close();
f.close();
}
catch (IOException e)
{
return true;
}
String strURL = url.getFile();
int index = 0;
while ((index = strCommands.indexOf("Disallow:", index)) != -1)
{
index += "Disallow:".length();
String strPath = strCommands.substring(index);
StringTokenizer st = new StringTokenizer(strPath);
if (!st.hasMoreTokens())
break;
String strBadPath = st.nextToken();
if (strURL.indexOf(strBadPath) == 0)
return false;
}
return true;
}
public void run()
{
String strURL = start;
toSearch.clear();
searched.clear();
toSearch.add(strURL);
while (toSearch.size() > 0)
{
strURL = (String) toSearch.get(0);
setStatus("searching " + strURL);
URL url;
try
{
url = new URL(strURL);
}
catch (MalformedURLException e)
{
setStatus("ERROR: invalid URL at line 92" + strURL);
break;
}
toSearch.remove(0);
searched.add(strURL);
if (url.getProtocol().compareTo("http") != 0)
{
setStatus("Break at line 99");
break;
}
if (!robotSafe(url))
{
setStatus("Not Robot safe!");
break;
}
try
{
URLConnection urlConnection = url.openConnection();
setStatus("Opening URL...");
urlConnection.setAllowUserInteraction(false);
InputStream urlStream = url.openStream();
String content = new String();
Scanner f = new Scanner(urlStream);
setStatus("Gathering data...");
while(f.hasNext())
{
content+=f.nextLine();
}
urlStream.close();
f.close();
String lowerCaseContent = content.toLowerCase();
int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
{
setStatus("Break at line 128");
break;
}
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
{
setStatus("Break at line 133");
break;
}
setStatus("Finding links...");
index++;
String remaining = content.substring(index);
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
URL urlLink;
try
{
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
}
catch (MalformedURLException e)
{
setStatus("ERROR: bad URL at line 149" + strLink);
continue;
}
if (urlLink.getProtocol().compareTo("http") != 0)
{
setStatus("Break at line 154");
break;
}
if ((!searched.contains(strLink)) && (!toSearch.contains(strLink)))
{
if (robotSafe(urlLink))
{
toSearch.add(strLink);
setStatus("Added "+strLink);
}
}
}
}
catch (IOException e)
{
setStatus("ERROR: couldn't open URL at line 139" + strURL);
break;
}
}
}
}
Code :
import javax.swing.JTextArea;
import java.util.ArrayList;
public class Updater implements Runnable
{
Crawler[] crawlers;
boolean stop;
Thread t;
JTextArea[] outputs;
CrawlManager cm;
ArrayList posted;
public Updater(Crawler[] crawlers,JTextArea[] outputs,CrawlManager cm)
{
this.crawlers = crawlers;
stop = false;
this.outputs = outputs;
posted = new ArrayList();
this.cm = cm;
}
public void start()
{
t = new Thread(this);
t.start();
}
public void stop()
{
stop = true;
if(t!=null)
{
t = null;
}
for(int x = 0;x<outputs.length;x++)
{
outputs[x].setText(null);
outputs[x].append("Stopped!");
}
}
public void run()
{
while(stop==false)
{
for(int x = 0;x<crawlers.length;x++)
{
ArrayList urls = crawlers[x].searched;
for(int f = 0;f<urls.size();f++)
{
String a = (String)urls.get(f);
if(posted.contains(a)==false)
{
outputs[4].append(a);
posted.add(a);
}
}
}
for(int x = 0;x<crawlers.length;x++)
{
String text = crawlers[x].getStatus();
outputs[x].append(text);
try
{
crawlers[x].sleep(100);
}
catch(Exception ex)
{
}
}
}
}
}