diff --git a/ignore-list b/ignore-list index 857365c..eab1c26 100644 --- a/ignore-list +++ b/ignore-list @@ -1,168 +1,2 @@ -https://www.reddit.com/ -https://www.redditstatic.com/desktop2x/img/snoo-upvote.png -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e50c34178d20d5fa4ab3c1f6c67901a9.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-0b0b9b2b7159c9bc6463e7ab3b0e8bd0.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-5267af566ab853eb9d74db1a78a46c67.woff -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-b85bf848c28799f5ad34ee29db68051c.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-5a241c76c24e463ef9bcc5855d20209b.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-255b4934a1f414dd312aa89382d65114.woff -https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-b16bb0524a7e7ee597970333c0c67180.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff -https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-29d72243d2cd6145b28bcb80dc33f0e4.woff2 -https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-06ee3f893717454d11a16c3e8d0aa9f9.woff -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-116bb6d508f5307861d3b1269bc597e7.woff2 -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-c4b185e25a4dde85a29f902cd5ce5360.woff2 -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-1051a531d3e1ee3483a6533158557139.woff -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-875de5047556e7c822519d95d7ee692d.woff2 -https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff -https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.eot -https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.woff -https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.ttf -https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.svg -https://www.redditstatic.com/desktop2x/Legacy~runtime~Reddit.97787cdd6c63f5bae0e3.js -https://www.redditstatic.com/desktop2x/Legacy~RedesignContentFonts.b488720bff09b2af6ac0.js -https://www.redditstatic.com/desktop2x/Legacy~RedesignSystemFonts.6f01c338e1546906d45d.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Poll~Reddit~RedesignChat.b7dece05c8943cdea084.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Governance~Reddit.91fc1d178146f81897d1.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Governance~Reddit.b2f8d4f01894b2592d5b.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Client~Governance~Reddit.f0c54af17c78c7813e02.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Gifts~Poll~Reddit.465b9d2661dafd9d1341.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Governance~Reddit.faa615b571967bf75e18.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Reddit.2d2f290c34f50d2de6eb.js -https://www.redditstatic.com/desktop2x/Legacy~Governance~Profile~ProfileHomepage~ProfilePostComments~R2CommentsPage~R2Listing~Reddit.1e5b73bb32dd79ae00af.js -https://www.redditstatic.com/desktop2x/Legacy~ChatPage~Client~Gifts~Governance~Reddit.1b90b6f863290aa3e6d6.js -https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Gifts~Governance~Reddit.c28cb0086d650fcaf481.js -https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Governance~Reddit.8b3e1712da474ac3ca10.js -https://www.redditstatic.com/desktop2x/Legacy~Chat~Governance~Reddit~RedesignChat.f6ae3f8ed29ed729ce89.js -https://www.redditstatic.com/desktop2x/Legacy~Client~Governance~Reddit.342d915109c0445b1569.js -https://www.redditstatic.com/desktop2x/Legacy~Governance~Reddit.4be452fa5e67799d348a.js -https://www.redditstatic.com/desktop2x/Legacy~Reddit.4a01b13741b20152b396.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPag~2698b78e.7a3127ba5ec30ed4e1c0.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~CollectionCommentsPage~CommentsPage~Frontpage~PostCreation~RedesignChat~RichTextEditor~~f6a0790c.b36927a082075e6f9b86.js -https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage.f88935fd981121681471.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GlobalModalContainer~GovernanceReleaseNotesMod~6b4ca950.94d657605a47d3304551.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~db251346.e51c88db6b855beb60a4.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Frontpage~ModListing~ModQueuePages~ModerationPages~Multireddit~N~0ef8faec.67b48672a7164c7362bc.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~1084d5fc.a27786f9cac978bb4a0a.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~fc7712a4.441c314ec50477ea7912.js -https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~d27514f2.1da837e692736e5e6c43.js -https://www.redditstatic.com/desktop2x/Legacy~CommentsPage.51d37b5d4496c188cc12.js -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-57x57.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-60x60.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-72x72.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-76x76.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-114x114.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-144x144.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-152x152.png -https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-180x180.png -https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png -https://www.redditstatic.com/desktop2x/img/favicon/favicon-32x32.png -https://www.redditstatic.com/desktop2x/img/favicon/favicon-96x96.png -https://www.redditstatic.com/desktop2x/img/favicon/favicon-16x16.png -https://www.redditstatic.com/desktop2x/img/favicon/manifest.json -https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png -https://www.redditstatic.com/desktop2x/js/ads.js -https://www.redditstatic.com/desktop2x/Legacy~Subreddit.32e5fa17c24840b8cbca.js -https://www.redditstatic.com/desktop2x/Legacy~Frontpage.cae047d3c2afb9e86a73.js -https://www.redditstatic.com/desktop2x/chunkCSS/Reddit.583754539e6661085608.css -https://www.redditstatic.com/desktop2x/img/gold/badges/award-gold-medium.png?v=2 -https://about.reddit.com/ -https://www.redditinc.com/ -http://www.w3.org/2000/svg -https://www.redditstatic.com/desktop2x/img/favicon/ms-icon-144x144.png -https://s.imgur.com/min/sharePlayer.css?1554398656 -https://s.imgur.com/min/imageViewerInline.js?1554398656 -https://i.imgur.com/favicon.ico -https://imgur.com/favicon.ico -https://s.imgur.com/min/sharePlayer.js?1554398656 -https://s.imgur.com/images/share-player-ffbg.png -https://s.imgur.com/images/loaders/ddddd1_181817/24.gif -https://s.imgur.com/images/favicon-32x32.png -https://s.imgur.com/images/favicon-96x96.png -https://s.imgur.com/images/favicon-16x16.png -https://s.imgur.com/min/global.css?1554398656 -https://s.imgur.com/min/gallery.css?1554398656 -https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js -https://s.imgur.com/include/js/ext/jquery.2.1.1.min.js -https://s.imgur.com/min/react15.js?1554398656 -https://s.imgur.com/min/global.js?1554398656 -https://s.imgur.com/min/advertising.js?1554398656 -https://s.imgur.com/min/px.js?ch=1 -https://s.imgur.com/min/px.js?ch=2 -https://s.imgur.com/min/runSlots.js?1554398656 -https://s.imgur.com/min/gallery.js?1554398656 -https://s.imgur.com/include/fonts/imgur.eot?7 -https://s.imgur.com/include/fonts/imgur.woff?7 -https://s.imgur.com/include/fonts/imgur.ttf?7 -https://s.imgur.com/include/fonts/imgur.svg?7 -https://s.imgur.com/include/fonts/proxima-nova-regular.eot -https://s.imgur.com/include/fonts/proxima-nova-regular.eot? -https://s.imgur.com/include/fonts/proxima-nova-regular.woff2 -https://s.imgur.com/include/fonts/proxima-nova-regular.woff -https://s.imgur.com/include/fonts/proxima-nova-regular.ttf -https://s.imgur.com/include/fonts/proxima-nova-regular.svg -https://s.imgur.com/include/fonts/proxima-nova-bold.eot -https://s.imgur.com/include/fonts/proxima-nova-bold.eot? -https://s.imgur.com/include/fonts/proxima-nova-bold.woff2 -https://s.imgur.com/include/fonts/proxima-nova-bold.woff -https://s.imgur.com/include/fonts/proxima-nova-bold.ttf -https://s.imgur.com/include/fonts/proxima-nova-bold.svg -https://s.imgur.com/images/site-sprite.png?1430420391 -https://s.imgur.com/images/button-icons.png -https://s.imgur.com/images/imgur-logo.svg?1 -https://s.imgur.com/images/svg/comment-notification.svg -https://s.imgur.com/images/svg/stars-notoriety.svg -https://s.imgur.com/images/house-cta/cta-background.jpg -https://s.imgur.com/images/house-cta/snowflake1.png -https://s.imgur.com/images/house-cta/snowflake2.png -https://s.imgur.com/images/house-cta/snowflake3.png -https://s.imgur.com/images/loaders/181817_ffffff/48.gif -https://s.imgur.com/images/house-cta/cta-sms-stars.png -https://s.imgur.com/images/house-cta/cta-sms-phone.png -https://s.imgur.com/images/icons/close-outline.svg -https://s.imgur.com/images/datepicker/datepicker_t.png -https://s.imgur.com/images/datepicker/datepicker_b.png -https://s.imgur.com/images/datepicker/datepicker_l.png -https://s.imgur.com/images/datepicker/datepicker_r.png -https://s.imgur.com/images/datepicker/datepicker_tl.png -https://s.imgur.com/images/datepicker/datepicker_tr.png -https://s.imgur.com/images/datepicker/datepicker_bl.png -https://s.imgur.com/images/datepicker/datepicker_br.png -https://s.imgur.com/images/imgur.gif -https://s.imgur.com/images/loaders/ddddd1_181817/48.gif -https://s.imgur.com/images/loaders/ddddd1_2b2b2b/24.gif -https://s.imgur.com/images/tipsy.png -https://s.imgur.com/include/magnify.cur -https://s.imgur.com/images/icons/icon-cake.svg -https://s.imgur.com/images/loaders/ddddd1_2b2b2b/48.gif -https://s.imgur.com/images/loaders/ddddd1_121211/48.gif -https://s.imgur.com/images/report-ad-layouts.png -https://s.imgur.com/images/radiobox_checkmark_small.png -https://s.imgur.com/images/icons/volume-high.png -https://s.imgur.com/images/icons/volume-mute.png -https://s.imgur.com/images/icons/full-screen.png -https://s.imgur.com/images/icons/full-screen-minimize.png -https://s.imgur.com/images/reactionGifPromoAsset.png -https://s.imgur.com/images/calendar.png -https://imgur.com/6JayaOr.png? -https://i.imgur.com/6JayaOr.png? -https://s.imgur.com/images/buttons-sprite.png -https://s.imgur.com/images/giraffe-tophat.png -https://s.imgur.com/images/icons/Teal-Folder.svg -https://s.imgur.com/images/icons/Outline-Folder.svg -https://s.imgur.com/images/loaders/ddddd1_121211/16.gif -https://s.imgur.com/images/fp-edit.png -https://s.imgur.com/images/dot-dot-dot.svg -http://www.redditblog.com/ -https://www.redditblog.com/ -https://redditblog.com/ -https://www.reddithelp.com/ -https://www.reddithelp.com/en -http://redditgifts.com/ -https://www.redditgifts.com/ -https://www.reddithelp.com/ -https://about.reddit.com/ +https://old.reddit.com/static/opensearch.xml +https://reddit.com/static/pixel.png diff --git a/pipeline.py b/pipeline.py index 0458057..72ab202 100644 --- a/pipeline.py +++ b/pipeline.py @@ -54,8 +54,8 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20200701.01' -USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; WOW64; Trident/4.0; SLCC1)' +VERSION = '20200726.01' +USER_AGENT = 'Archive Team' TRACKER_ID = 'reddit' TRACKER_HOST = 'trackerproxy.meo.ws' @@ -216,6 +216,7 @@ class WgetArgs(object): '--timeout', '30', '--tries', 'inf', '--domains', 'reddit.com', + '--header', 'Cookie: over18=1', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), @@ -237,18 +238,21 @@ class WgetArgs(object): ]) item_name = item['item_name'] - item_type, item_value = item_name.split(':', 1) + item_type, item_value = item_name.split('.', 1) item['item_type'] = item_type item['item_value'] = item_value - if item_type == 'posts': + if item_type in ('posts', 'comments'): start, end = item_value.split('-') for i in range(int(start), int(end)+1): post_id = self.int_to_str(i) - wget_args.extend(['--warc-header', 'reddit-post: {}'.format(post_id)]) - wget_args.append('https://www.reddit.com/comments/{}'.format(post_id)) - #wget_args.append('https://old.reddit.com/comments/{}'.format(post_id)) + if item_type == 'posts': + wget_args.extend(['--warc-header', 'reddit-post: {}'.format(post_id)]) + wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(post_id)) + elif item_type == 'comments': + wget_args.extend(['--warc-header', 'reddit-comment: {}'.format(post_id)]) + wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(post_id)) else: raise Exception('Unknown item') diff --git a/reddit.lua b/reddit.lua index 88d473d..1c4fbd9 100644 --- a/reddit.lua +++ b/reddit.lua @@ -15,6 +15,7 @@ local abortgrab = false local posts = {} local requested_children = {} +local thumbs = {} for ignore in io.open("ignore-list", "r"):lines() do downloaded[ignore] = true @@ -46,19 +47,50 @@ processed = function(url) return false end -allowed = function(url, parenturl, source) +allowed = function(url, parenturl) + local match = string.match(url, "^https?://[^%.]+%.thumbs%.redditmedia%.com/([^%.]+)%.") + if match + and parenturl + and string.match(parenturl, "^https?://www%.reddit%.com/api/info%.json%?id=") then + thumbs[match] = true + end + + if match and not thumbs[match] then + return false + end + if string.match(url, "'+") - or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") - or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+") - or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+") - or string.match(url, "^https?://[^/]*reddit%.com/login") - or string.match(url, "^https?://[^/]*reddit%.com/register") - or string.match(url, "%?sort=") - or string.match(url, "^https?://[^/]*reddit%.app%.link/") - or string.match(url, "^https?://out%.reddit%.com/r/") - or (string.match(url, "^https?://gateway%.reddit%.com/") and not string.match(url, "/morecomments/")) - or string.match(url, "/%.rss$") - or (parenturl and string.match(url, "^https?://amp%.reddit%.com/")) then + or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") + or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+") + or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+") + or string.match(url, "^https?://[^/]*reddit%.com/login") + or string.match(url, "^https?://[^/]*reddit%.com/register") + or string.match(url, "%?sort=") + or string.match(url, "%?limit=500$") + or string.match(url, "%?ref=readnext$") + or string.match(url, "^https?://[^/]*reddit%.app%.link/") + or string.match(url, "^https?://out%.reddit%.com/r/") + or string.match(url, "^https?://emoji%.redditmedia%.com/") + or string.match(url, "^https?://styles%.redditmedia%.com/") + or string.match(url, "^https?://[^%.]+%.redd%.it/award_images/") + or ( + string.match(url, "^https?://gateway%.reddit%.com/") + and not string.match(url, "/morecomments/") + ) + or string.match(url, "/%.rss$") + or ( + parenturl + and string.match(url, "^https?://amp%.reddit%.com/") + ) + or ( + item_type == "posts" + and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$") + ) + or ( + parenturl + and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") + and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") + ) then return false end @@ -77,10 +109,13 @@ allowed = function(url, parenturl, source) return false end - if string.match(url, "^https?://[^/]*redditmedia%.com/") - or string.match(url, "^https?://www%.reddit%.com/api/morechildren$") - or string.match(url, "^https?://v%.redd%.it/[^/]+/[^/]+$") - or string.match(url, "^https?://preview%.redd%.it/[^/]+/[^/]+$") then + if (string.match(url, "^https?://[^/]*redditmedia%.com/") + or string.match(url, "^https?://old%.reddit%.com/api/morechildren$") + or string.match(url, "^https?://v%.redd%.it/") + or string.match(url, "^https?://i%.redd%.it/") + or string.match(url, "^https?://[^%.]*preview%.redd%.it/.") + ) + and not string.match(item_type, "comment") then return true end @@ -89,16 +124,6 @@ allowed = function(url, parenturl, source) return true end end - - if parenturl - and string.match(parenturl, "^https?://www%.reddit%.com/") - and source ~= "download_child_p" - and not string.match(url, "^https?://[^/]*reddit%.com/") - and not string.match(url, "^https?://[^/]*youtube%.com") - and not string.match(url, "^https?://[^/]*youtu%.be") - and not string.match(url, "^https?://[^/]*redd%.it/") then - return true - end return false end @@ -107,18 +132,27 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars local url = urlpos["url"]["url"] local html = urlpos["link_expect_html"] - if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") then + if item_type == "comments" then + return false + end + + if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") + or string.match(url, "^https?://[^/]*redditstatic%.com/") + or string.match(url, "^https?://old%.reddit%.com/static/") + or string.match(url, "^https?://www%.reddit%.com/static/") + or string.match(url, "^https?://styles%.redditmedia%.com/") + or string.match(url, "^https?://emoji%.redditmedia%.com/") + or string.match(url, "/%.rss$") then return false end - if string.match(parent["url"], "^https?://www%.reddit%.com/comments/[a-z0-9]+") then + if string.match(parent["url"], "^https?://old%.reddit%.com/comments/[a-z0-9]+") then return true end if not processed(url) - and (allowed(url, parent["url"], "download_child_p") or (allowed(parent["url"], nil, "download_child_p") and html == 0)) then + and (allowed(url, parent["url"]) or (allowed(parent["url"]) and html == 0)) then addedtolist[url] = true -print('b ' .. html .. ' ' .. url) return true end @@ -137,12 +171,11 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&", "&") if not processed(url_) and string.match(url_, "^https?://.+") - and allowed(url_, origurl, "get_urls") + and allowed(url_, origurl) and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then table.insert(urls, { url=url_ }) addedtolist[url_] = true addedtolist[url] = true -print('a ' .. url) end end @@ -170,79 +203,100 @@ print('a ' .. url) if string.match(newurl, "^%?") then check(string.match(url, "^(https?://[^%?]+)") .. newurl) elseif not (string.match(newurl, "^https?:\\?/\\?//?/?") - or string.match(newurl, "^[/\\]") - or string.match(newurl, "^%./") - or string.match(newurl, "^[jJ]ava[sS]cript:") - or string.match(newurl, "^[mM]ail[tT]o:") - or string.match(newurl, "^vine:") - or string.match(newurl, "^android%-app:") - or string.match(newurl, "^ios%-app:") - or string.match(newurl, "^%${")) then + or string.match(newurl, "^[/\\]") + or string.match(newurl, "^%./") + or string.match(newurl, "^[jJ]ava[sS]cript:") + or string.match(newurl, "^[mM]ail[tT]o:") + or string.match(newurl, "^vine:") + or string.match(newurl, "^android%-app:") + or string.match(newurl, "^ios%-app:") + or string.match(newurl, "^data:") + or string.match(newurl, "^%${")) then check(string.match(url, "^(https?://.+/)") .. newurl) end end - if string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[a-z0-9]+") then - posts[string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([a-z0-9]+)")] = true + if string.match(url, "^https?://www%.reddit%.com/") then + check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) + --elseif string.match(url, "^https?://old%.reddit%.com/") then + -- check(string.gsub(url, "^https?://old%.reddit%.com/", "https://www.reddit.com/")) end - if allowed(url, nil, "get_urls") - and status_code < 300 - and not string.match(url, "^https?://[^/]*redditmedia%.com/") - and not string.match(url, "^https?://[^/]*redditstatic%.com/") - and not string.match(url, "^https?://out%.reddit%.com/") - and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts$") - and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*$") then + if allowed(url) + and status_code < 300 + and not string.match(url, "^https?://[^/]*redditmedia%.com/") + and not string.match(url, "^https?://[^/]*redditstatic%.com/") + and not string.match(url, "^https?://out%.reddit%.com/") + and not string.match(url, "^https?://[^%.]*preview%.redd%.it/") + and not string.match(url, "^https?://i%.redd%.it/") + and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts") + and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.mp4") then html = read_file(file) - if string.match(url, "^https?://www%.reddit%.com/api/morechildren$") then + if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') - elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/") - or string.match(url, "^https?://www%.reddit%.com/r/[^/]+/duplicates/") then + elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") + or string.match(url, "^https?://old%.reddit%.com/r/[^/]+/duplicates/") then html = string.gsub(html, "%s*.-%s*%s*%s*", "") end - if string.match(url, "^https?://www%.reddit%.com/") then - for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*[0-9]+,%s*'[^']+'%))") do - local link_id, sort, children, depth, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*([0-9]+),%s*'([^']+)'%)$") + if string.match(url, "^https?://old%.reddit%.com/") then + for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*'[^']+'%))") do + local link_id, sort, children, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*'([^']+)'%)$") local id = string.match(children, "^([^,]+)") local subreddit = string.match(html, 'data%-subreddit="([^"]+)"') - local post_data = "link_id=" .. link_id .. "&sort=" .. sort .. "&children=" .. string.gsub(children, ",", "%%2C") .. "&depth=" .. depth .. "&id=t1_" .. id .. "&limit_children=" .. limit_children .. "&r=" .. subreddit .. "&renderstyle=html" - if requested_children[post_data] == nil then + local post_data = + "link_id=" .. link_id .. + "&sort=" .. sort .. + "&children=" .. string.gsub(children, ",", "%%2C") .. + "&id=t1_" .. id .. + "&limit_children=" .. limit_children .. + "&r=" .. subreddit .. + "&renderstyle=html" + if not requested_children[post_data] then requested_children[post_data] = true - table.insert(urls, {url="https://www.reddit.com/api/morechildren", + table.insert(urls, {url="https://old.reddit.com/api/morechildren", post_data=post_data}) end end - --elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") - -- or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") - -- or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then - -- for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do - -- local post_data = '{"token":"' .. s .. '"}' - -- local comment_id = nil - -- if string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") then - -- comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)") - -- elseif string.match(url, "^https?://www%.reddit%.com/comments/[^/]") then - -- comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)") - -- elseif string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then - -- comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)") - -- end - -- if requested_children[post_data] == nil then - -- requested_children[post_data] = true - -- table.insert(urls, {url="https://gateway.reddit.com/desktopapi/v1/morecomments/t3_" .. comment_id .. "?rtj=only&allow_over18=1&include=", - -- post_data=post_data}) - -- end - -- end + elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") + or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") + or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then + for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do + local post_data = '{"token":"' .. s .. '"}' + local comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)") + if comment_id == nil then + comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)") + end + if comment_id == nil then + comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)") + end + if comment_id == nil then + print("Could not find comment ID.") + abortgrab = true + end + if not requested_children[post_data] then + requested_children[post_data] = true + table.insert(urls, {url= + "https://gateway.reddit.com/desktopapi/v1/morecomments/t3_" .. comment_id .. + "?emotes_as_images=true" .. + "&rtj=only" .. + "&allow_over18=1" .. + "&include=", + post_data=post_data + }) + end + end end if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do check("https?://www.reddit.com" .. s) end end - if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.mpd$") then + if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.mpd") then for s in string.gmatch(html, "([^<]+)") do checknewshorturl(s) end end - if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.m3u8$") then + if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.m3u8") then for s in string.gmatch(html, "(.-)\n") do if not string.match(s, "^#") then checknewshorturl(s) @@ -279,6 +333,11 @@ wget.callbacks.httploop_result = function(url, err, http_stat) io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. " \n") io.stdout:flush() + local match = string.match(url["url"], "^https?://www%.reddit.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$") + if match then + posts[match] = true + end + if status_code == 204 then return wget.actions.EXIT end @@ -292,25 +351,17 @@ wget.callbacks.httploop_result = function(url, err, http_stat) elseif not string.match(newloc, "^https?://") then newloc = string.match(url["url"], "^(https?://.+/)") .. newloc end - if downloaded[newloc] == true or addedtolist[newloc] == true then + if processed(newloc) or not allowed(newloc, url["url"]) then return wget.actions.EXIT end end - - if downloaded[url["url"]] and http_stat["rderrmsg"] then - io.stdout:write("Url was already downloaded.\n") - io.stdout:write(http_stat["rderrmsg"] .. "\n") - io.stdout:write("Skipping URL.\n") - io.stdout:flush() - return wget.actions.EXIT - end if (status_code >= 200 and status_code <= 399) then downloaded[url["url"]] = true downloaded[string.gsub(url["url"], "https?://", "http://")] = true end - if abortgrab == true then + if abortgrab then io.stdout:write("ABORTING...\n") return wget.actions.ABORT end @@ -321,23 +372,26 @@ wget.callbacks.httploop_result = function(url, err, http_stat) io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. "). Sleeping.\n") io.stdout:flush() local maxtries = 8 - if not allowed(url["url"], nil, "httploop_result") then + if not allowed(url["url"]) then maxtries = 0 end if tries >= maxtries then io.stdout:write("\nI give up...\n") io.stdout:flush() tries = 0 - if allowed(url["url"], nil, "httploop_result") then + if allowed(url["url"]) then return wget.actions.ABORT else return wget.actions.EXIT end - else - os.execute("sleep " .. math.floor(math.pow(2, tries))) - tries = tries + 1 - return wget.actions.CONTINUE end + os.execute("sleep " .. math.floor(math.pow(2, tries))) + tries = tries + 1 + return wget.actions.CONTINUE + end + + if string.match(url["url"], "^https?://[^/]+%.reddit%.com/api/info%?id=t[0-9]_[a-z0-9]+$") then + return wget.actions.EXIT end tries = 0 @@ -352,7 +406,7 @@ wget.callbacks.httploop_result = function(url, err, http_stat) end wget.callbacks.before_exit = function(exit_status, exit_status_string) - if abortgrab == true then + if abortgrab then return wget.exits.IO_FAIL end return exit_status