from k1lib.imports import *
2023-12-04 02:01:36,528 INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.1.19:6379...
2023-12-04 02:01:36,535 INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at 127.0.0.1:8265
I'm still so deep into Touhou, and there's a series on the Touhou subreddit called Gensokyo days that's pretty nice. Problem is, scrolling all the way back to the first stories gets painfully slow. So I want to crawl Reddit lightly to grab those stories, both for me to read and to provide a simpler example of the browser automation tool than last time.
url = "https://www.reddit.com/user/BlackDragonTribe/"
b = zircon.newBrowser()
await b.scan("mint3") | shape()
Connecting to server... Connected
(10, 28)
await b.pickExt(await b.scan("mint3") | grep("_30") | item())
'ok'
await b.goto("https://www.reddit.com/user/BlackDragonTribe/")
e = await b.querySelector(".rpBJOHq2PR60pnwJlUyP0"); e
<Element selector='.rpBJOHq2PR60pnwJlUyP0' browser=_ext_158826568_1701652845_30>
posts = await e.children() | k1.Wrapper()
await (posts() | op().value("outerHTML").all() | deref() | ~aS(asyncio.gather)) | k1.Wrapper() | aS(dill.dumps) | file("postBodies.pth")
'postBodies.pth'
from k1lib.imports import *; import bs4
postBodies = cat.pickle("postBodies.pth") | item()
with k1.ignoreWarnings(): postBs = postBodies() | apply(bs4.BeautifulSoup) | deref(1) | k1.Wrapper()
def refinePost(postB):
    href = postB.find_all("a") | filt(op().find_all("h3")) | item() | op().attrs["href"]
    title = postB.find_all("h3") | item()
    return [title, href]
postMeta = postBs() | apply(refinePost) | filt(op().startswith("/r/touhou"), 1) | deref() | k1.Wrapper()
postMeta() | display() & shape() | deref()
['421: "The first, but not the last"'] /r/touhou/comments/189swmw/421_the_first_but_not_the_last/ ['420: "Down the rabbit hole"'] /r/touhou/comments/1894ce9/420_down_the_rabbit_hole/ ['long hair Nazrin'] /r/touhou/comments/188se3i/long_hair_nazrin/ ['419: "The anticipation is killing me"'] /r/touhou/comments/188djoy/419_the_anticipation_is_killing_me/ ['418: "Who didn\'t see that coming"'] /r/touhou/comments/187izd5/418_who_didnt_see_that_coming/ ['Gym Reimu'] /r/touhou/comments/187cut6/gym_reimu/ ['retro.exe (shart)'] /r/touhou/comments/17yhgkv/retroexe_shart/ ['417: "The most important thing"'] /r/touhou/comments/186rsbn/417_the_most_important_thing/ ['158: Yakumo Family Dynamics'] /r/touhou/comments/186jy94/158_yakumo_family_dynamics/ ['bed hair'] /r/touhou/comments/186lgpg/bed_hair/
[None, (654, 2, 1, 34)]
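If the cli pipes inside refinePost look opaque, here's roughly the same extraction in plain bs4. This is just a quick sketch on a made-up snippet (the refinePostPlain name is mine), not a cell I actually ran:
import bs4

def refinePostPlain(postB):
    """Plain-Python refinePost: [title, href] for one post body. The permalink is the <a> that wraps the <h3> title."""
    href = next(a for a in postB.find_all("a") if a.find_all("h3")).attrs["href"]
    title = postB.find_all("h3")[0].get_text()
    return [title, href]

html = '<div><a href="/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/"><h3>420: "Down the rabbit hole"</h3></a></div>'
refinePostPlain(bs4.BeautifulSoup(html, "html.parser"))
# ['420: "Down the rabbit hole"', '/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/']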
Now let's actually do the crawling. The goal is to grab all image tags from each post's page.
b = zircon.newBrowser()
await b.pickExt(await b.scan("mint3") | item())
Connecting to server... Connected
'ok'
await b.scan(["mint2", "mint3"]) | shape()
(30, 28)
await b.goto("/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/")
{}
bs4.BeautifulSoup(await (await b.querySelector("body")).value("innerHTML")).find_all("img") | op().attrs.get("src", None).all() | filt("x") | filt(op().startswith("http")) | deref()
['https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_2.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_0.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_1.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://i.redd.it/jw2x7zo81w3c1.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png?width=256&s=c45b37c7ea9db1e4403635d410faf12a57b741bc']
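That one-liner packs a lot in; in plain Python it's roughly the following (a sketch, assuming body already holds the page's innerHTML as a string):
import bs4

def extractImgUrls(body:str) -> list:
    """Every <img> src that's an absolute http(s) URL, duplicates kept."""
    srcs = [img.attrs.get("src") for img in bs4.BeautifulSoup(body, "html.parser").find_all("img")]
    return [s for s in srcs if s and s.startswith("http")]

extractImgUrls('<body><img src="https://i.redd.it/jw2x7zo81w3c1.png"><img src="data:,"><img></body>')
# ['https://i.redd.it/jw2x7zo81w3c1.png']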
Ok nice, the extraction works. Let's do it.
b"" | file("gd.pth")
'gd.pth'
goingToVisit = postMeta() | cut(1) | deref() | aS(deque) | k1.Wrapper()
async def crawl(b:"zircon.Browser"):
    if len(goingToVisit()) == 0: raise zircon.BrowserCancel()
    url = goingToVisit().popleft()
    try:
        startTime = time.time()
        await b.goto(f"https://www.reddit.com{url}", 60)
        title = await (await b.querySelector("title")).value("innerHTML")
        body = await (await b.querySelector("body")).value("innerHTML")
        imgs = bs4.BeautifulSoup(body).find_all("img") | op().attrs.get("src", None).all() | filt("x") | filt(op().startswith("http")) | deref()
        # below is guaranteed to work and will not be interrupted
        [startTime, time.time(), url, title, imgs] | aS(dill.dumps) >> file("gd.pth")
    except Exception as e: goingToVisit().append(url)
bg = zircon.BrowserGroup(["mint2", "mint3"], 100)
await bg.execute(crawl, 30)
Executing. #browsers=30 #running=0 #tasks=0 Task finished
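The crawl above leans on zircon's BrowserGroup, but the underlying pattern is generic: a shared work queue, push the URL back on failure, and append each finished page as one dill record to a single file so a crash loses at most the page in flight. Here's a minimal synchronous sketch of that pattern with requests instead of a real browser (so it only sees whatever Reddit serves without JS, which may differ from what the extension renders):
import time, dill, requests, bs4
from collections import deque

queue = deque(["/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/"])  # normally postMeta()'s hrefs

def crawlOne(queue, outFn="gd_plain.pth") -> bool:
    """Process one URL off the queue; on failure, push it back to retry later."""
    if not queue: return False
    url = queue.popleft()
    try:
        startTime = time.time()
        resp = requests.get(f"https://www.reddit.com{url}", timeout=60, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        soup = bs4.BeautifulSoup(resp.text, "html.parser")
        title = soup.title.get_text() if soup.title else ""
        imgs = [i.attrs.get("src") for i in soup.find_all("img")]
        imgs = [s for s in imgs if s and s.startswith("http")]
        # append one self-contained dill record per page, mirroring `aS(dill.dumps) >> file("gd.pth")`
        with open(outFn, "ab") as f: f.write(dill.dumps([startTime, time.time(), url, title, imgs]))
    except Exception: queue.append(url)
    return True

for _ in range(1000):                     # crude retry/total-attempt cap; bg.execute() handles this more gracefully
    if not crawlOne(queue): break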
from k1lib.imports import *
cat.pickle("gd.pth") | item() & shape() | deref()
[[1701657743.621378, 1701657752.7771413, '/r/touhou/comments/17y8f13/148_based_meme/', '148: Based Meme : touhou', ['https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_2.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_7.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://preview.redd.it/nfiadn3ig41c1.png?width=640&crop=smart&auto=webp&s=3730200dd1caf0ac3056cb5a7eff61d248c5caae', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png?width=256&s=c45b37c7ea9db1e4403635d410faf12a57b741bc']], (641, 5)]
Total time (minutes):
cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | reverse() | item() | op()/60
4.252806413173675
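For reference, that pipe is just the spread of the end-time column. A plain-Python sketch, assuming gd.pth is back-to-back dill pickles of [startTime, endTime, url, title, imgs] rows (which is what the crawl wrote):
import dill

records = []
with open("gd.pth", "rb") as f:               # one dill record per crawled page, appended back-to-back
    while True:
        try: records.append(dill.load(f))
        except EOFError: break

endTimes = [r[1] for r in records]            # column 1 = time.time() right after extraction
(max(endTimes) - min(endTimes)) / 60          # total wall-clock minutes, ~4.25 for this run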
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plt.sca(axes[0]); cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | window(2) | ~apply(lambda x,y: y-x) | filtStd() | filtStd() | deref() | aS(plt.hist, bins=30)
plt.xlabel("Time between 2 consecutive page scans (s)"); plt.ylabel("Frequency");
plt.sca(axes[1]); cat.pickle("gd.pth") | cut(0, 1) | ~apply(lambda x,y: y-x) | deref() | aS(plt.hist, bins=30)
plt.xlabel("Time to extract all data out of the page (s)"); plt.ylabel("Frequency"); plt.tight_layout()
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plt.sca(axes[0]); cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | window(2) | ~apply(lambda x,y: y-x) | insertIdColumn() | T() | smooth(10).all() | deref() | ~aS(plt.plot)
plt.xlabel("#pages"); plt.ylabel("Time between 2 consecutive scans (s)"); plt.grid(True)
plt.sca(axes[1]); cat.pickle("gd.pth") | cut(0, 1) | ~apply(lambda x,y: y-x) | insertIdColumn() | T() | smooth(10).all() | deref() | ~aS(plt.plot)
plt.xlabel("#pages"); plt.ylabel("Time to extract all data out of the page (s)"); plt.grid(True); plt.tight_layout()
Seems pretty stable over time! No perf degradation like last time.
cat.pickle("gd.pth") | cut(4) | shape(0).all() | deref() | aS(plt.hist, bins=30);
plt.xlabel("#images/page"); plt.ylabel("Frequency");
Total images (duplicates allowed):
cat.pickle("gd.pth") | cut(4) | joinSt() | shape(0)
8450
Total images (duplicates not allowed):
cat.pickle("gd.pth") | cut(4) | joinSt() | aS(set) | shape(0)
693
Much smaller! Okay, let's keep only the images that are unique to a single page; that filters out the avatars, community icons and other chrome that shows up everywhere:
cat.pickle("gd.pth") | shape()
(641, 5)
imgs = cat.pickle("gd.pth") | cut(2, 4) | ungroup() | groupBy(1, True) | apply(joinSt(), 1) | deref() | filt("len(x) == 1", 1) | ungroup() | permute(1, 0) | sort(0, False) | deref()
imgs | display(20)
/r/touhou/comments/100c9ui/happy_new_year_rtouhou/           https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec
/r/touhou/comments/100khxk/130_age_old_question/             https://i.redd.it/rfk1bldjwf9a1.png
/r/touhou/comments/101fbft/ex_008_the_strongest_fairy/       https://i.redd.it/u5h9j2h6fn9a1.png
/r/touhou/comments/103460g/132_love_is_in_the_airwaves/      https://i.redd.it/duk5ymkd71aa1.png
/r/touhou/comments/1040y8j/133_in_need_of_assistance/        https://i.redd.it/as386nlao8aa1.png
/r/touhou/comments/104ufea/134_its_funnier_that_way/         https://i.redd.it/hk4fr1jzefaa1.png
/r/touhou/comments/106j8gj/135_myon_in_the_mirror/           https://i.redd.it/16ck426nmtaa1.png
/r/touhou/comments/107ebtp/136_just_an_average_day/          https://i.redd.it/qmbhf216r0ba1.png
/r/touhou/comments/108d1k0/137_she_got_there_in_the_end/     https://i.redd.it/9zsevr71m8ba1.png
/r/touhou/comments/1097ggt/138_maintenance/                  https://i.redd.it/qwvdiu8hnfba1.png
/r/touhou/comments/10axlb6/139_i_always_feel_like/           https://i.redd.it/e54myusdztba1.png
/r/touhou/comments/10bofy6/140_multiplying/                  https://i.redd.it/1rkn7r7eg0ca1.png
/r/touhou/comments/10dfmi2/141_playing_both_sides/           https://i.redd.it/bpszik1tyeca1.png
/r/touhou/comments/10efe6l/142_marital_issues/               https://i.redd.it/xvxxmbsnjmca1.png
/r/touhou/comments/10f8gsx/143_sky_high/                     https://i.redd.it/nsp2iuk29tca1.png
/r/touhou/comments/10g2ulf/144_masterpiece/                  https://i.redd.it/bc8qzt71e0da1.png
/r/touhou/comments/10h0n3h/145_burning_desire/               https://i.redd.it/jszdy82q18da1.png
/r/touhou/comments/10hv6lp/146_remodel/                      https://i.redd.it/izj22d2sbfda1.png
/r/touhou/comments/10k7pwk/148_moon_with_a_view/             https://i.redd.it/6bz5ic2jc0ea1.png
/r/touhou/comments/10kymz4/149_doctor_is_in/                 https://i.redd.it/b3bh41poy6ea1.png
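In plain Python that filter is just a Counter over image URL occurrences, keeping the URLs that appear exactly once across all pages. A sketch on two fabricated rows shaped like gd.pth's [startTime, endTime, url, title, imgs]:
from collections import Counter

records = [
    [0, 1, "/r/touhou/comments/aaa/", "A", ["https://i.redd.it/page_a.png", "https://styles.redditmedia.com/icon.png"]],
    [0, 1, "/r/touhou/comments/bbb/", "B", ["https://i.redd.it/page_b.png", "https://styles.redditmedia.com/icon.png"]],
]
counts = Counter(img for r in records for img in r[4])                       # occurrences across every page
sorted((r[2], img) for r in records for img in r[4] if counts[img] == 1)
# [('/r/touhou/comments/aaa/', 'https://i.redd.it/page_a.png'), ('/r/touhou/comments/bbb/', 'https://i.redd.it/page_b.png')]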
Ok nice! Let's quickly download all of them:
imgs | cut(1) | iden() & applyMp(tryout() | cat(text=False), timeout=60, prefetch=10) | transpose() | tee().autoInc() | apply(dill.dumps) | file("imgs.pth")
663) 663, 56s elapsed
'imgs.pth'
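applyMp(tryout() | cat(text=False)) fans the downloads out across processes and swallows individual failures. A rough thread-based equivalent with the standard library and requests would look like this (a sketch, assuming imgs is the (pageUrl, imageUrl) list from above):
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    """Download one image's bytes; return None instead of raising, like tryout() does."""
    try:
        r = requests.get(url, timeout=60); r.raise_for_status(); return r.content
    except Exception: return None

urls = [u for _, u in imgs]
with ThreadPoolExecutor(max_workers=16) as ex:
    blobs = list(ex.map(fetch, urls))              # bytes or None, same order as urls
pairs = list(zip(urls, blobs))                     # roughly the (url, imageBytes) rows that end up in imgs.pth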
That was quite fast! What are the images' resolutions?
whs = cat.pickle("imgs.pth") | apply(toImg() | shape(), 1) | deref() | k1.Wrapper()
whs() | shape()
(664, 2, 115)
whs() | cut(1) | T() | (sketch(['plt.xlabel("Value")', 'plt.ylabel("Frequency")', 'plt.yscale("log")'], ["Width", "Height"]) | deref() | aS(plt.hist, bins=30))
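toImg() | shape() boils down to PIL's .size here. A small sketch, assuming pairs of (url, imageBytes) like the imgs.pth rows:
import io
from PIL import Image

def resolution(blob):
    """(width, height) of an image given its raw bytes, or None if it doesn't decode."""
    try: return Image.open(io.BytesIO(blob)).size
    except Exception: return None

# e.g. whs = [resolution(b) for _, b in pairs if b is not None]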
Let's see a few examples from each width segment:
cat.pickle("imgs.pth") | apply(toImg(), 1) | ~apply(lambda x,y: [y,y|shape(0)]) | sort(1) | batchedTrigger(1, delta=60) | apply(unique(1) & iden() | joinSt() | head(5) | reverse()) | apply(T() | iden() + (count() | cut(1) | sort(None) | join(" - "))) | permute(1, 0) | apply(apply(tf.Resize(500)) | viz.Carousel() | toHtml(), 1) | apply(fmt.h, 0, level=3) | reverse() | viz.Carousel(searchMode=2)
Thought this image was cute, so I'm making it the thumbnail:
# thumbnail
cat.pickle("imgs.pth") | apply(toImg(), 1) | ~apply(lambda x,y: [y,y|shape(0)]) | sort(1) | batchedTrigger(1, delta=60) | apply(unique(1) & iden() | joinSt() | head(5)) | rItem(3) | cut(0) | item()
Would be nice to have transcripts of the images:
%%time
cat.pickle("imgs.pth") | ~sortF(len, 1) | cut(1) | rItem(2) | toImg() | kapi.ocr(True, False)
CPU times: user 91.4 ms, sys: 153 ms, total: 244 ms
Wall time: 1.44 s
<Ocr shape=(1079, 3860)>
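If you don't have kapi.ocr() available, pytesseract is a common swap-in for getting rough transcripts (a sketch, assuming both the Python package and the tesseract binary are installed; quality will differ from the output above):
import io
from PIL import Image
import pytesseract                        # pip install pytesseract, plus the tesseract-ocr binary

def transcript(blob) -> str:
    """Rough OCR transcript of one image, given its raw bytes."""
    return pytesseract.image_to_string(Image.open(io.BytesIO(blob)))

# e.g. texts = [(url, transcript(b)) for url, b in pairs if b is not None]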
Ok cool. Should take 15 minutes to get all transcripts:
#notest
cat.pickle("imgs.pth") | tee().autoInc() | applyMp(~aS(lambda x,y: [x, y | toImg() | kapi.ocr()]), timeout=3600, prefetch=10) | apply(dill.dumps) | file("transcripts.pth")
663) 663, 215s elapsed
'transcripts.pth'
#notest
cat.pickle("imgs.pth") | tee().autoInc() | applyMp(~aS(lambda x,y: [x, y | toImg() | kapi.ocr(True, False)]), timeout=3600, prefetch=10) | apply(dill.dumps) | file("transcripts.pth")
663) 663, 347s elapsed
'transcripts.pth'
Only takes 6 minutes! Wow, that's faster than I thought!
Grab the Gensokyo days pages only:
pg_ims = imgs | ~apply(lambda x,y: [x,y,y]) | lookup(cat.pickle("imgs.pth") | apply(toImg(), 1) | toDict(), 2) | deref() | k1.Wrapper()
pg_ims() | item() & shape() | deref()
[['/r/touhou/comments/100c9ui/happy_new_year_rtouhou/', 'https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec', <PIL.PngImagePlugin.PngImageFile image mode=P size=960x539>], (664, 3, 50)]
a = pg_ims() | filt(op() | shape(0) | (op()>800), 2) | groupBy(0, True) | apply(cut(0), 1) | deref(); a[:10]
[['/r/touhou/comments/100c9ui/happy_new_year_rtouhou/', ['https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec']], ['/r/touhou/comments/100khxk/130_age_old_question/', ['https://i.redd.it/rfk1bldjwf9a1.png']], ['/r/touhou/comments/101fbft/ex_008_the_strongest_fairy/', ['https://i.redd.it/u5h9j2h6fn9a1.png']], ['/r/touhou/comments/103460g/132_love_is_in_the_airwaves/', ['https://i.redd.it/duk5ymkd71aa1.png']], ['/r/touhou/comments/1040y8j/133_in_need_of_assistance/', ['https://i.redd.it/as386nlao8aa1.png']], ['/r/touhou/comments/104ufea/134_its_funnier_that_way/', ['https://i.redd.it/hk4fr1jzefaa1.png']], ['/r/touhou/comments/106j8gj/135_myon_in_the_mirror/', ['https://i.redd.it/16ck426nmtaa1.png']], ['/r/touhou/comments/107ebtp/136_just_an_average_day/', ['https://i.redd.it/qmbhf216r0ba1.png']], ['/r/touhou/comments/108d1k0/137_she_got_there_in_the_end/', ['https://i.redd.it/9zsevr71m8ba1.png']], ['/r/touhou/comments/1097ggt/138_maintenance/', ['https://i.redd.it/qwvdiu8hnfba1.png']]]
idxs = a | cut(0) | op().split("/")[5].split("_")[0].all() | toInt() | filt("x<500") | sort(None) | deref()
idxs | window(2) | ~apply(lambda x,y: y-x) | toSum() & shape(0) | deref()
[419, 410]
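The episode number is just the first chunk of the URL slug, so the missing-page arithmetic is small in plain Python (a sketch, assuming a is the [pageUrl, [imageUrls]] list from above; non-episode posts get skipped):
def episodeNo(pageUrl):
    """'/r/touhou/comments/100khxk/130_age_old_question/' -> 130, or None for non-episode posts."""
    slug = pageUrl.split("/")[5].split("_")[0]
    return int(slug) if slug.isdigit() else None

idxs = sorted(n for page, _ in a if (n := episodeNo(page)) is not None and n < 500)
span = idxs[-1] - idxs[0]          # 419, same as the telescoped sum of consecutive differences above
(span + 1) - len(idxs)             # number of episode indexes in that range we never crawled, ~9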
Those two numbers are very close. Looks like we're only missing 9 pages out of 420 (nice). Since the images themselves would be too large to fit in here, I'm just going to include links to them:
a | apply(op().split("/")[5].split("_")[0], 0) | toInt(0) | filt("x<500", 0) | sort(0) | apply(item(), 1) | ~apply(lambda idx, url: f"<a style='white-space: nowrap' target='_blank' href='{url}'>Episode {idx}</a>") | batched(20, True) | apply(fmt.col) | aS(fmt.row) | aS(IPython.display.HTML)
Ok cool! Well, looks like we're done here.