Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added a lite option to remove content #2785

Merged
merged 3 commits into from
Oct 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 38 additions & 34 deletions seek/seek.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class Parser

# Custom OptionParser ScriptOptions
class ScriptOptions
attr_accessor :keyword, :location, :range, :worktype, :delay, :time, :print_total
attr_accessor :keyword, :location, :range, :worktype, :delay, :time, :print_total, :lite

def define_options(parser)
parser.banner = "Usage: #{Paint['seek.rb [options]', :red, :white]}"
Expand All @@ -72,6 +72,7 @@ def define_options(parser)
delay_execution_option(parser)
execute_at_time_option(parser)
print_total_number_option(parser)
lite_option(parser)

parser.separator ""
parser.separator "Common options:"
Expand Down Expand Up @@ -157,11 +158,22 @@ def print_total_number_option(parser)
end
end
end

# Registers the optional-argument switch `--lite [BOOLEAN]` on +parser+.
#
# The handler always stores a strict boolean in +lite+:
# * true  - when the supplied value, converted to a string, equals "true" or
#           "yes" ignoring case (a literal +true+ stringifies to "true").
# * false - for anything else, including +false+, +nil+ and "no".
#
# NOTE(review): a nil value (presumably what OptionParser hands over when the
# optional BOOLEAN is omitted) therefore results in +false+ - confirm that a
# bare `--lite` is meant to leave the content column in place.
def lite_option(parser)
  description = "If BOOLEAN is true or 'yes', do not include the content column in the CSV"
  parser.on("--lite [BOOLEAN]", description) do |value|
    text = value.to_s
    self.lite = %w[true yes].any? { |word| text.casecmp(word).zero? }
  end
end

end

#
# Return a structure describing the options.
#
def parse(args)
# The options specified on the command line will be collected in
# *options*.
Expand Down Expand Up @@ -209,6 +221,10 @@ def parse(args)
print "Only print the total number of jobs found? (yes/no): "
options.print_total = $stdin.gets.chomp.casecmp("yes").zero?
end
if options.lite.nil?
print "Discard the content column in the results? (yes/no): "
options.lite = $stdin.gets.chomp.casecmp("yes").zero?
end

agent = Mechanize.new
agent.user_agent_alias = "Windows Chrome"
Expand All @@ -223,21 +239,9 @@ def parse(args)
["worktype", options.worktype]
]
)
results = []
results <<
[
"Title",
"URL",
"Advertiser",
"Location",
"Listing Date",
"Salary",
"Classification",
"Sub Classification",
# "Work Type",
"Short Description",
"Content"
]
results = [
["Title", "URL", "Advertiser", "Location", "Listing Date", "Salary", "Classification", "Sub Classification", "Short Description"] + (options.lite ? [] : ["Content"])
]

if options.print_total
# Using the CSS selector
Expand Down Expand Up @@ -276,21 +280,21 @@ def parse(args)
# listing_date = ad.at('dd[data-automation="job-detail-date"]').text if listing_date.empty?
get_script = ad.at('script[data-automation="server-state"]').text
salary = get_script.gsub(/(.*"jobSalary":")(.*?)(".*)/m, '\2') if salary.empty? && get_script.include?("jobSalary")
content = get_script.gsub(/(.*"content\(\{\\"platform\\":\\"WEB\\"\}\)":")(.*?)(".*)/m, '\2')
results <<
[
title,
url,
advertiser,
location,
listing_date,
salary,
classification,
sub_classification,
# work_type,
short_description,
content
]
content = options.lite ? nil : get_script.gsub(/(.*"content\(\{\\"platform\\":\\"WEB\\"\}\)":")(.*?)(".*)/m, '\2')
resultsrow = [
title,
url,
advertiser,
location,
listing_date,
salary,
classification,
sub_classification,
# work_type,
short_description,
]
resultsrow << content unless options.lite
results << resultsrow
end

if (link = page.link_with(text: "Next")) # As long as there is still a next page link
Expand Down
Loading