Version 20230614.02. Extra validity checks.

pull/17/head
arkiver 1 year ago
parent e84e804fc5
commit be6e32cba5

@@ -59,7 +59,7 @@ if not WGET_AT:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
-VERSION = '20230614.01'
+VERSION = '20230614.02'
TRACKER_ID = 'reddit'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 40

@@ -685,6 +685,7 @@ wget.callbacks.write_to_warc = function(url, http_stat)
or has_video
or string.match(html, "v%.redd%.it")
or string.match(html, "reddit_video") then
print("Not writing to WARC.")
abort_item()
return false
end
@@ -707,6 +708,7 @@ wget.callbacks.write_to_warc = function(url, http_stat)
string.match(url["url"], "^https?://old%.reddit%.com/api/morechildren$")
and not JSON:decode(html)["success"]
) then
print("Not writing to WARC.")
retry_url = true
return false
end
@@ -715,6 +717,9 @@ wget.callbacks.write_to_warc = function(url, http_stat)
if (
string.match(url["url"], "^https?://[^/]+/svc/")
and string.match(html, 'level%s*=')
) or (
string.match(url["url"], "^https?://old%.reddit%.com/r/")
and not string.match(html, 'class="live%-timestamp"')
) or (
string.match(url["url"], "^https?://www%.reddit%.com/r/")
and (
@@ -727,14 +732,14 @@ wget.callbacks.write_to_warc = function(url, http_stat)
is_comments_comment
or item_type ~= "comment"
)
and not string.match(html, "<shreddit%-title")
and (
not string.match(html, "<shreddit%-title")
or not string.match(html, 'id="time%-ago%-separator"')
)
)
)
) then
io.stdout:write("Reddit has a problem for URL " .. url["url"] .. ". Pausing 120 seconds and aborting.\n")
io.stdout:flush()
os.execute("sleep 120")
killgrab = true
print("Not writing to WARC.")
retry_url = true
return false
end

Loading…
Cancel
Save