Perl PasteBin scraper
up vote
3
down vote
favorite
I'd like to optimize my code and get a better understanding of how I can perform this task more effectively. I've only used Perl threads about 3 or 4 times now.
The purpose of my script / code block is to scrape the Pastebin API for every new public paste that appears at regular intervals using a crontab. Then search inside those new pastes for user defined regular expressions such as "google.com." The paste is then saved to a sqlite DB for later viewing and a notification is sent to a listening nodejs bot.
That being said here is my code block I wish to have improved upon:
# threadCheckKey($url, $key, @regexs)
#
# Downloads the paste at "$url$key" with curl inside a worker thread and tests
# every line against each user-supplied pattern in @regexs. When at least one
# pattern matches, the paste text, the current local date (strftime "%D") and
# the matching patterns are written to the row of $tables[0] whose pastekey is
# $key, and an alert is POSTed to the local bot endpoint.
#
# Blocks until the worker thread has finished. Returns nothing.
sub threadCheckKey {
    my ($url, $key, @regexs) = @_;
    my $fullURL = $url . $key;
    my $date    = strftime "%D", localtime;

    # Core module; loaded here (before the thread starts) so the alert body
    # can be JSON-escaped instead of hand-interpolated.
    require JSON::PP;

    my $thread = threads->create(sub {
        # Compile each pattern once; keep its source text for reporting.
        my @patterns = map { [ $_, qr/$_/ ] } @regexs;

        # List-form open runs curl directly, so nothing in the URL is ever
        # interpreted by a shell.
        open(my $fetch, '-|', 'curl', '-s', '-k', '--url', $fullURL)
            or die("Cannot run curl: $!");

        # The paste is kept in memory; no scratch file is written.
        my $paste = '';
        my (@flaggedRegex, %isFlagged);
        while (my $line = <$fetch>) {
            $paste .= $line;
            for my $pattern (@patterns) {
                my ($source, $compiled) = @{$pattern};
                # Record each pattern once, by exact source text.
                next if $isFlagged{$source};
                next if $line !~ $compiled;
                $isFlagged{$source} = 1;
                push(@flaggedRegex, $source);
            }
        }
        close($fetch)
            or warn("curl failed for $fullURL (wait status $?)\n");

        # Nothing matched: there is nothing to store or announce.
        return if !@flaggedRegex;

        my $flaggedReg = join(" | ", @flaggedRegex);

        # Open the database only once a match is known, and always close it.
        my $dbConnection = openDB();
        my $updateRow = qq(UPDATE $tables[0] set data = ?, date = ?, regex = ? where pastekey = ?);
        my $updateRowPrepare = $dbConnection->prepare($updateRow);
        my $executeRowUpdate = $updateRowPrepare
            ? $updateRowPrepare->execute($paste, $date, $flaggedReg, $key)
            : undef;
        # DBI reports failure as undef; a negative count only means the
        # number of affected rows is unknown.
        if (!defined $executeRowUpdate) {
            warn("Could not update paste $key: $DBI::errstr\n");
        }
        $dbConnection->disconnect();

        my $msg = qq(**Alert:** New paste found! The paste can be viewed via https://pastebin.com/$key - Regex Trigger: $flaggedReg);
        # Patterns routinely contain quotes and backslashes, so let the
        # encoder produce the JSON body.
        my $payload = JSON::PP->new->utf8->encode({ text => $msg });
        open(my $alert, '-|',
            'curl', 'https://localhost:8051/bots/bot_service_id/alert', '-k',
            '--data', $payload,
            '-H', 'Content-Type: application/json',
            '-X', 'POST')
            or die("Cannot run curl: $!");
        # Drain the response so curl is not cut off by a closed pipe.
        while (defined(my $ignored = <$alert>)) { }
        close($alert)
            or warn("Alert for paste $key was not delivered (wait status $?)\n");
        return;
    });

    # join() blocks until the worker ends, so the thread is always reaped.
    $thread->join();
    return;
}
multithreading io perl sqlite curl
New contributor
add a comment |
up vote
3
down vote
favorite
I'd like to optimize my code and get a better understanding of how I can perform this task more effectively. I've only used Perl threads about 3 or 4 times now.
The purpose of my script / code block is to scrape the Pastebin API for every new public paste that appears at regular intervals using a crontab. Then search inside those new pastes for user defined regular expressions such as "google.com." The paste is then saved to a sqlite DB for later viewing and a notification is sent to a listening nodejs bot.
That being said here is my code block I wish to have improved upon:
# threadCheckKey($url, $key, @regexs)
#
# Downloads the paste at "$url$key" with curl inside a worker thread and tests
# every line against each user-supplied pattern in @regexs. When at least one
# pattern matches, the paste text, the current local date (strftime "%D") and
# the matching patterns are written to the row of $tables[0] whose pastekey is
# $key, and an alert is POSTed to the local bot endpoint.
#
# Blocks until the worker thread has finished. Returns nothing.
sub threadCheckKey {
    my ($url, $key, @regexs) = @_;
    my $fullURL = $url . $key;
    my $date    = strftime "%D", localtime;

    # Core module; loaded here (before the thread starts) so the alert body
    # can be JSON-escaped instead of hand-interpolated.
    require JSON::PP;

    my $thread = threads->create(sub {
        # Compile each pattern once; keep its source text for reporting.
        my @patterns = map { [ $_, qr/$_/ ] } @regexs;

        # List-form open runs curl directly, so nothing in the URL is ever
        # interpreted by a shell.
        open(my $fetch, '-|', 'curl', '-s', '-k', '--url', $fullURL)
            or die("Cannot run curl: $!");

        # The paste is kept in memory; no scratch file is written.
        my $paste = '';
        my (@flaggedRegex, %isFlagged);
        while (my $line = <$fetch>) {
            $paste .= $line;
            for my $pattern (@patterns) {
                my ($source, $compiled) = @{$pattern};
                # Record each pattern once, by exact source text.
                next if $isFlagged{$source};
                next if $line !~ $compiled;
                $isFlagged{$source} = 1;
                push(@flaggedRegex, $source);
            }
        }
        close($fetch)
            or warn("curl failed for $fullURL (wait status $?)\n");

        # Nothing matched: there is nothing to store or announce.
        return if !@flaggedRegex;

        my $flaggedReg = join(" | ", @flaggedRegex);

        # Open the database only once a match is known, and always close it.
        my $dbConnection = openDB();
        my $updateRow = qq(UPDATE $tables[0] set data = ?, date = ?, regex = ? where pastekey = ?);
        my $updateRowPrepare = $dbConnection->prepare($updateRow);
        my $executeRowUpdate = $updateRowPrepare
            ? $updateRowPrepare->execute($paste, $date, $flaggedReg, $key)
            : undef;
        # DBI reports failure as undef; a negative count only means the
        # number of affected rows is unknown.
        if (!defined $executeRowUpdate) {
            warn("Could not update paste $key: $DBI::errstr\n");
        }
        $dbConnection->disconnect();

        my $msg = qq(**Alert:** New paste found! The paste can be viewed via https://pastebin.com/$key - Regex Trigger: $flaggedReg);
        # Patterns routinely contain quotes and backslashes, so let the
        # encoder produce the JSON body.
        my $payload = JSON::PP->new->utf8->encode({ text => $msg });
        open(my $alert, '-|',
            'curl', 'https://localhost:8051/bots/bot_service_id/alert', '-k',
            '--data', $payload,
            '-H', 'Content-Type: application/json',
            '-X', 'POST')
            or die("Cannot run curl: $!");
        # Drain the response so curl is not cut off by a closed pipe.
        while (defined(my $ignored = <$alert>)) { }
        close($alert)
            or warn("Alert for paste $key was not delivered (wait status $?)\n");
        return;
    });

    # join() blocks until the worker ends, so the thread is always reaped.
    $thread->join();
    return;
}
multithreading io perl sqlite curl
New contributor
add a comment |
up vote
3
down vote
favorite
up vote
3
down vote
favorite
I'd like to optimize my code and get a better understanding of how I can perform this task more effectively. I've only used Perl threads about 3 or 4 times now.
The purpose of my script / code block is to scrape the Pastebin API for every new public paste that appears at regular intervals using a crontab. Then search inside those new pastes for user defined regular expressions such as "google.com." The paste is then saved to a sqlite DB for later viewing and a notification is sent to a listening nodejs bot.
That being said here is my code block I wish to have improved upon:
# threadCheckKey($url, $key, @regexs)
#
# Downloads the paste at "$url$key" with curl inside a worker thread and tests
# every line against each user-supplied pattern in @regexs. When at least one
# pattern matches, the paste text, the current local date (strftime "%D") and
# the matching patterns are written to the row of $tables[0] whose pastekey is
# $key, and an alert is POSTed to the local bot endpoint.
#
# Blocks until the worker thread has finished. Returns nothing.
sub threadCheckKey {
    my ($url, $key, @regexs) = @_;
    my $fullURL = $url . $key;
    my $date    = strftime "%D", localtime;

    # Core module; loaded here (before the thread starts) so the alert body
    # can be JSON-escaped instead of hand-interpolated.
    require JSON::PP;

    my $thread = threads->create(sub {
        # Compile each pattern once; keep its source text for reporting.
        my @patterns = map { [ $_, qr/$_/ ] } @regexs;

        # List-form open runs curl directly, so nothing in the URL is ever
        # interpreted by a shell.
        open(my $fetch, '-|', 'curl', '-s', '-k', '--url', $fullURL)
            or die("Cannot run curl: $!");

        # The paste is kept in memory; no scratch file is written.
        my $paste = '';
        my (@flaggedRegex, %isFlagged);
        while (my $line = <$fetch>) {
            $paste .= $line;
            for my $pattern (@patterns) {
                my ($source, $compiled) = @{$pattern};
                # Record each pattern once, by exact source text.
                next if $isFlagged{$source};
                next if $line !~ $compiled;
                $isFlagged{$source} = 1;
                push(@flaggedRegex, $source);
            }
        }
        close($fetch)
            or warn("curl failed for $fullURL (wait status $?)\n");

        # Nothing matched: there is nothing to store or announce.
        return if !@flaggedRegex;

        my $flaggedReg = join(" | ", @flaggedRegex);

        # Open the database only once a match is known, and always close it.
        my $dbConnection = openDB();
        my $updateRow = qq(UPDATE $tables[0] set data = ?, date = ?, regex = ? where pastekey = ?);
        my $updateRowPrepare = $dbConnection->prepare($updateRow);
        my $executeRowUpdate = $updateRowPrepare
            ? $updateRowPrepare->execute($paste, $date, $flaggedReg, $key)
            : undef;
        # DBI reports failure as undef; a negative count only means the
        # number of affected rows is unknown.
        if (!defined $executeRowUpdate) {
            warn("Could not update paste $key: $DBI::errstr\n");
        }
        $dbConnection->disconnect();

        my $msg = qq(**Alert:** New paste found! The paste can be viewed via https://pastebin.com/$key - Regex Trigger: $flaggedReg);
        # Patterns routinely contain quotes and backslashes, so let the
        # encoder produce the JSON body.
        my $payload = JSON::PP->new->utf8->encode({ text => $msg });
        open(my $alert, '-|',
            'curl', 'https://localhost:8051/bots/bot_service_id/alert', '-k',
            '--data', $payload,
            '-H', 'Content-Type: application/json',
            '-X', 'POST')
            or die("Cannot run curl: $!");
        # Drain the response so curl is not cut off by a closed pipe.
        while (defined(my $ignored = <$alert>)) { }
        close($alert)
            or warn("Alert for paste $key was not delivered (wait status $?)\n");
        return;
    });

    # join() blocks until the worker ends, so the thread is always reaped.
    $thread->join();
    return;
}
multithreading io perl sqlite curl
New contributor
I'd like to optimize my code and get a better understanding of how I can perform this task more effectively. I've only used Perl threads about 3 or 4 times now.
The purpose of my script / code block is to scrape the Pastebin API for every new public paste that appears at regular intervals using a crontab. Then search inside those new pastes for user defined regular expressions such as "google.com." The paste is then saved to a sqlite DB for later viewing and a notification is sent to a listening nodejs bot.
That being said here is my code block I wish to have improved upon:
# threadCheckKey($url, $key, @regexs)
#
# Downloads the paste at "$url$key" with curl inside a worker thread and tests
# every line against each user-supplied pattern in @regexs. When at least one
# pattern matches, the paste text, the current local date (strftime "%D") and
# the matching patterns are written to the row of $tables[0] whose pastekey is
# $key, and an alert is POSTed to the local bot endpoint.
#
# Blocks until the worker thread has finished. Returns nothing.
sub threadCheckKey {
    my ($url, $key, @regexs) = @_;
    my $fullURL = $url . $key;
    my $date    = strftime "%D", localtime;

    # Core module; loaded here (before the thread starts) so the alert body
    # can be JSON-escaped instead of hand-interpolated.
    require JSON::PP;

    my $thread = threads->create(sub {
        # Compile each pattern once; keep its source text for reporting.
        my @patterns = map { [ $_, qr/$_/ ] } @regexs;

        # List-form open runs curl directly, so nothing in the URL is ever
        # interpreted by a shell.
        open(my $fetch, '-|', 'curl', '-s', '-k', '--url', $fullURL)
            or die("Cannot run curl: $!");

        # The paste is kept in memory; no scratch file is written.
        my $paste = '';
        my (@flaggedRegex, %isFlagged);
        while (my $line = <$fetch>) {
            $paste .= $line;
            for my $pattern (@patterns) {
                my ($source, $compiled) = @{$pattern};
                # Record each pattern once, by exact source text.
                next if $isFlagged{$source};
                next if $line !~ $compiled;
                $isFlagged{$source} = 1;
                push(@flaggedRegex, $source);
            }
        }
        close($fetch)
            or warn("curl failed for $fullURL (wait status $?)\n");

        # Nothing matched: there is nothing to store or announce.
        return if !@flaggedRegex;

        my $flaggedReg = join(" | ", @flaggedRegex);

        # Open the database only once a match is known, and always close it.
        my $dbConnection = openDB();
        my $updateRow = qq(UPDATE $tables[0] set data = ?, date = ?, regex = ? where pastekey = ?);
        my $updateRowPrepare = $dbConnection->prepare($updateRow);
        my $executeRowUpdate = $updateRowPrepare
            ? $updateRowPrepare->execute($paste, $date, $flaggedReg, $key)
            : undef;
        # DBI reports failure as undef; a negative count only means the
        # number of affected rows is unknown.
        if (!defined $executeRowUpdate) {
            warn("Could not update paste $key: $DBI::errstr\n");
        }
        $dbConnection->disconnect();

        my $msg = qq(**Alert:** New paste found! The paste can be viewed via https://pastebin.com/$key - Regex Trigger: $flaggedReg);
        # Patterns routinely contain quotes and backslashes, so let the
        # encoder produce the JSON body.
        my $payload = JSON::PP->new->utf8->encode({ text => $msg });
        open(my $alert, '-|',
            'curl', 'https://localhost:8051/bots/bot_service_id/alert', '-k',
            '--data', $payload,
            '-H', 'Content-Type: application/json',
            '-X', 'POST')
            or die("Cannot run curl: $!");
        # Drain the response so curl is not cut off by a closed pipe.
        while (defined(my $ignored = <$alert>)) { }
        close($alert)
            or warn("Alert for paste $key was not delivered (wait status $?)\n");
        return;
    });

    # join() blocks until the worker ends, so the thread is always reaped.
    $thread->join();
    return;
}
multithreading io perl sqlite curl
multithreading io perl sqlite curl
New contributor
New contributor
edited Nov 14 at 17:10
200_success
127k15148410
127k15148410
New contributor
asked Nov 14 at 16:28
falconspy
1162
1162
New contributor
New contributor
add a comment |
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
falconspy is a new contributor. Be nice, and check out our Code of Conduct.
falconspy is a new contributor. Be nice, and check out our Code of Conduct.
falconspy is a new contributor. Be nice, and check out our Code of Conduct.
falconspy is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207665%2fperl-pastebin-scraper%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown