from k1lib.imports import *
2023-12-04 02:01:36,528 INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.1.19:6379...
2023-12-04 02:01:36,535 INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at 127.0.0.1:8265
I'm still so deep into Touhou, and there's a series on the Touhou subreddit called Gensokyo days that's pretty nice. Problem is, scrolling all the way back to the first stories gets painfully slow. So I want to crawl Reddit lightly to grab those stories, both for me to read and to provide a simpler example of the browser automation tool than last time.
url = "https://www.reddit.com/user/BlackDragonTribe/"
b = zircon.newBrowser()
await b.scan("mint3") | shape()
Connecting to server... Connected
(10, 28)
await b.pickExt(await b.scan("mint3") | grep("_30") | item())
'ok'
await b.goto("https://www.reddit.com/user/BlackDragonTribe/")
e = await b.querySelector(".rpBJOHq2PR60pnwJlUyP0"); e
<Element selector='.rpBJOHq2PR60pnwJlUyP0' browser=_ext_158826568_1701652845_30>
posts = await e.children() | k1.Wrapper()
await (posts() | op().value("outerHTML").all() | deref() | ~aS(asyncio.gather)) | k1.Wrapper() | aS(dill.dumps) | file("postBodies.pth")
'postBodies.pth'
from k1lib.imports import *; import bs4
postBodies = cat.pickle("postBodies.pth") | item()
with k1.ignoreWarnings(): postBs = postBodies() | apply(bs4.BeautifulSoup) | deref(1) | k1.Wrapper()
def refinePost(postB):
    href = postB.find_all("a") | filt(op().find_all("h3")) | item() | op().attrs["href"]
    title = postB.find_all("h3") | item()
    return [title, href]
postMeta = postBs() | apply(refinePost) | filt(op().startswith("/r/touhou"), 1) | deref() | k1.Wrapper()
postMeta() | display() & shape() | deref()
['421: "The first, but not the last"'] /r/touhou/comments/189swmw/421_the_first_but_not_the_last/ ['420: "Down the rabbit hole"'] /r/touhou/comments/1894ce9/420_down_the_rabbit_hole/ ['long hair Nazrin'] /r/touhou/comments/188se3i/long_hair_nazrin/ ['419: "The anticipation is killing me"'] /r/touhou/comments/188djoy/419_the_anticipation_is_killing_me/ ['418: "Who didn\'t see that coming"'] /r/touhou/comments/187izd5/418_who_didnt_see_that_coming/ ['Gym Reimu'] /r/touhou/comments/187cut6/gym_reimu/ ['retro.exe (shart)'] /r/touhou/comments/17yhgkv/retroexe_shart/ ['417: "The most important thing"'] /r/touhou/comments/186rsbn/417_the_most_important_thing/ ['158: Yakumo Family Dynamics'] /r/touhou/comments/186jy94/158_yakumo_family_dynamics/ ['bed hair'] /r/touhou/comments/186lgpg/bed_hair/
[None, (654, 2, 1, 34)]
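If the cli pipes inside refinePost look opaque, here's roughly the same extraction in plain bs4. This is just a quick sketch on a made-up snippet (the refinePostPlain name is mine), not a cell I actually ran:
import bs4

def refinePostPlain(postB):
    """Plain-Python refinePost: [title, href] for one post body. The permalink is the <a> that wraps the <h3> title."""
    href = next(a for a in postB.find_all("a") if a.find_all("h3")).attrs["href"]
    title = postB.find_all("h3")[0].get_text()
    return [title, href]

html = '<div><a href="/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/"><h3>420: "Down the rabbit hole"</h3></a></div>'
refinePostPlain(bs4.BeautifulSoup(html, "html.parser"))
# ['420: "Down the rabbit hole"', '/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/']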
Now let's actually do the crawling. The goal is to grab all image tags from each post's page.
b = zircon.newBrowser()
await b.pickExt(await b.scan("mint3") | item())
Connecting to server... Connected
'ok'
await b.scan(["mint2", "mint3"]) | shape()
(30, 28)
await b.goto("/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/")
{}
bs4.BeautifulSoup(await (await b.querySelector("body")).value("innerHTML")).find_all("img") | op().attrs.get("src", None).all() | filt("x") | filt(op().startswith("http")) | deref()
['https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_2.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_0.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_1.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://i.redd.it/jw2x7zo81w3c1.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png?width=256&s=c45b37c7ea9db1e4403635d410faf12a57b741bc']
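That one-liner packs a lot in; in plain Python it's roughly the following (a sketch, assuming body already holds the page's innerHTML as a string):
import bs4

def extractImgUrls(body:str) -> list:
    """Every <img> src that's an absolute http(s) URL, duplicates kept."""
    srcs = [img.attrs.get("src") for img in bs4.BeautifulSoup(body, "html.parser").find_all("img")]
    return [s for s in srcs if s and s.startswith("http")]

extractImgUrls('<body><img src="https://i.redd.it/jw2x7zo81w3c1.png"><img src="data:,"><img></body>')
# ['https://i.redd.it/jw2x7zo81w3c1.png']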
Ok nice, the extraction works. Let's do it.
b"" | file("gd.pth")
'gd.pth'
goingToVisit = postMeta() | cut(1) | deref() | aS(deque) | k1.Wrapper()
async def crawl(b:"zircon.Browser"):
    if len(goingToVisit()) == 0: raise zircon.BrowserCancel()
    url = goingToVisit().popleft()
    try:
        startTime = time.time()
        await b.goto(f"https://www.reddit.com{url}", 60)
        title = await (await b.querySelector("title")).value("innerHTML")
        body = await (await b.querySelector("body")).value("innerHTML")
        imgs = bs4.BeautifulSoup(body).find_all("img") | op().attrs.get("src", None).all() | filt("x") | filt(op().startswith("http")) | deref()
        # below is guaranteed to work and will not be interrupted
        [startTime, time.time(), url, title, imgs] | aS(dill.dumps) >> file("gd.pth")
    except Exception as e: goingToVisit().append(url)
bg = zircon.BrowserGroup(["mint2", "mint3"], 100)
await bg.execute(crawl, 30)
Executing. #browsers=30 #running=0 #tasks=0 Task finished
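The crawl above leans on zircon's BrowserGroup, but the underlying pattern is generic: a shared work queue, push the URL back on failure, and append each finished page as one dill record to a single file so a crash loses at most the page in flight. Here's a minimal synchronous sketch of that pattern with requests instead of a real browser (so it only sees whatever Reddit serves without JS, which may differ from what the extension renders):
import time, dill, requests, bs4
from collections import deque

queue = deque(["/r/touhou/comments/1894ce9/420_down_the_rabbit_hole/"])  # normally postMeta()'s hrefs

def crawlOne(queue, outFn="gd_plain.pth") -> bool:
    """Process one URL off the queue; on failure, push it back to retry later."""
    if not queue: return False
    url = queue.popleft()
    try:
        startTime = time.time()
        resp = requests.get(f"https://www.reddit.com{url}", timeout=60, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        soup = bs4.BeautifulSoup(resp.text, "html.parser")
        title = soup.title.get_text() if soup.title else ""
        imgs = [i.attrs.get("src") for i in soup.find_all("img")]
        imgs = [s for s in imgs if s and s.startswith("http")]
        # append one self-contained dill record per page, mirroring `aS(dill.dumps) >> file("gd.pth")`
        with open(outFn, "ab") as f: f.write(dill.dumps([startTime, time.time(), url, title, imgs]))
    except Exception: queue.append(url)
    return True

for _ in range(1000):                     # crude retry/total-attempt cap; bg.execute() handles this more gracefully
    if not crawlOne(queue): break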
from k1lib.imports import *
cat.pickle("gd.pth") | item() & shape() | deref()
[[1701657743.621378, 1701657752.7771413, '/r/touhou/comments/17y8f13/148_based_meme/', '148: Based Meme : touhou', ['https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_2.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_1.png', 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_7.png', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://preview.redd.it/nfiadn3ig41c1.png?width=640&crop=smart&auto=webp&s=3730200dd1caf0ac3056cb5a7eff61d248c5caae', 'https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png', 'https://styles.redditmedia.com/t5_2qvi5/styles/communityIcon_05i75cvsrswb1.png?width=256&s=c45b37c7ea9db1e4403635d410faf12a57b741bc']], (641, 5)]
Total time (minutes):
cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | reverse() | item() | op()/60
4.252806413173675
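For reference, that pipe is just the spread of the end-time column. A plain-Python sketch, assuming gd.pth is back-to-back dill pickles of [startTime, endTime, url, title, imgs] rows (which is what the crawl wrote):
import dill

records = []
with open("gd.pth", "rb") as f:               # one dill record per crawled page, appended back-to-back
    while True:
        try: records.append(dill.load(f))
        except EOFError: break

endTimes = [r[1] for r in records]            # column 1 = time.time() right after extraction
(max(endTimes) - min(endTimes)) / 60          # total wall-clock minutes, ~4.25 for this run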
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plt.sca(axes[0]); cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | window(2) | ~apply(lambda x,y: y-x) | filtStd() | filtStd() | deref() | aS(plt.hist, bins=30)
plt.xlabel("Time between 2 consecutive page scans (s)"); plt.ylabel("Frequency");
plt.sca(axes[1]); cat.pickle("gd.pth") | cut(0, 1) | ~apply(lambda x,y: y-x) | deref() | aS(plt.hist, bins=30)
plt.xlabel("Time to extract all data out of the page (s)"); plt.ylabel("Frequency"); plt.tight_layout()
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plt.sca(axes[0]); cat.pickle("gd.pth") | cut(1) | sort(None) | zeroes() | window(2) | ~apply(lambda x,y: y-x) | insertIdColumn() | T() | smooth(10).all() | deref() | ~aS(plt.plot)
plt.xlabel("#pages"); plt.ylabel("Time between 2 consecutive scans (s)"); plt.grid(True)
plt.sca(axes[1]); cat.pickle("gd.pth") | cut(0, 1) | ~apply(lambda x,y: y-x) | insertIdColumn() | T() | smooth(10).all() | deref() | ~aS(plt.plot)
plt.xlabel("#pages"); plt.ylabel("Time to extract all data out of the page (s)"); plt.grid(True); plt.tight_layout()
Seems pretty stable over time! No perf degradation like last time.
cat.pickle("gd.pth") | cut(4) | shape(0).all() | deref() | aS(plt.hist, bins=30);
plt.xlabel("#images/page"); plt.ylabel("Frequency");
Total images (duplicates allowed):
cat.pickle("gd.pth") | cut(4) | joinSt() | shape(0)
8450
Total images (duplicates not allowed):
cat.pickle("gd.pth") | cut(4) | joinSt() | aS(set) | shape(0)
693
Much smaller! Okay, let's keep only the images that are unique to a single page; that filters out the avatars, community icons and other chrome that shows up everywhere:
cat.pickle("gd.pth") | shape()
(641, 5)
imgs = cat.pickle("gd.pth") | cut(2, 4) | ungroup() | groupBy(1, True) | apply(joinSt(), 1) | deref() | filt("len(x) == 1", 1) | ungroup() | permute(1, 0) | sort(0, False) | deref()
imgs | display(20)
/r/touhou/comments/100c9ui/happy_new_year_rtouhou/           https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec
/r/touhou/comments/100khxk/130_age_old_question/             https://i.redd.it/rfk1bldjwf9a1.png
/r/touhou/comments/101fbft/ex_008_the_strongest_fairy/       https://i.redd.it/u5h9j2h6fn9a1.png
/r/touhou/comments/103460g/132_love_is_in_the_airwaves/      https://i.redd.it/duk5ymkd71aa1.png
/r/touhou/comments/1040y8j/133_in_need_of_assistance/        https://i.redd.it/as386nlao8aa1.png
/r/touhou/comments/104ufea/134_its_funnier_that_way/         https://i.redd.it/hk4fr1jzefaa1.png
/r/touhou/comments/106j8gj/135_myon_in_the_mirror/           https://i.redd.it/16ck426nmtaa1.png
/r/touhou/comments/107ebtp/136_just_an_average_day/          https://i.redd.it/qmbhf216r0ba1.png
/r/touhou/comments/108d1k0/137_she_got_there_in_the_end/     https://i.redd.it/9zsevr71m8ba1.png
/r/touhou/comments/1097ggt/138_maintenance/                  https://i.redd.it/qwvdiu8hnfba1.png
/r/touhou/comments/10axlb6/139_i_always_feel_like/           https://i.redd.it/e54myusdztba1.png
/r/touhou/comments/10bofy6/140_multiplying/                  https://i.redd.it/1rkn7r7eg0ca1.png
/r/touhou/comments/10dfmi2/141_playing_both_sides/           https://i.redd.it/bpszik1tyeca1.png
/r/touhou/comments/10efe6l/142_marital_issues/               https://i.redd.it/xvxxmbsnjmca1.png
/r/touhou/comments/10f8gsx/143_sky_high/                     https://i.redd.it/nsp2iuk29tca1.png
/r/touhou/comments/10g2ulf/144_masterpiece/                  https://i.redd.it/bc8qzt71e0da1.png
/r/touhou/comments/10h0n3h/145_burning_desire/               https://i.redd.it/jszdy82q18da1.png
/r/touhou/comments/10hv6lp/146_remodel/                      https://i.redd.it/izj22d2sbfda1.png
/r/touhou/comments/10k7pwk/148_moon_with_a_view/             https://i.redd.it/6bz5ic2jc0ea1.png
/r/touhou/comments/10kymz4/149_doctor_is_in/                 https://i.redd.it/b3bh41poy6ea1.png
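In plain Python that filter is just a Counter over image URL occurrences, keeping the URLs that appear exactly once across all pages. A sketch on two fabricated rows shaped like gd.pth's [startTime, endTime, url, title, imgs]:
from collections import Counter

records = [
    [0, 1, "/r/touhou/comments/aaa/", "A", ["https://i.redd.it/page_a.png", "https://styles.redditmedia.com/icon.png"]],
    [0, 1, "/r/touhou/comments/bbb/", "B", ["https://i.redd.it/page_b.png", "https://styles.redditmedia.com/icon.png"]],
]
counts = Counter(img for r in records for img in r[4])                       # occurrences across every page
sorted((r[2], img) for r in records for img in r[4] if counts[img] == 1)
# [('/r/touhou/comments/aaa/', 'https://i.redd.it/page_a.png'), ('/r/touhou/comments/bbb/', 'https://i.redd.it/page_b.png')]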
Ok nice! Let's quickly download all of them:
imgs | cut(1) | iden() & applyMp(tryout() | cat(text=False), timeout=60, prefetch=10) | transpose() | tee().autoInc() | apply(dill.dumps) | file("imgs.pth")
663) 663, 56s elapsed
'imgs.pth'
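applyMp(tryout() | cat(text=False)) fans the downloads out across processes and swallows individual failures. A rough thread-based equivalent with the standard library and requests would look like this (a sketch, assuming imgs is the (pageUrl, imageUrl) list from above):
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    """Download one image's bytes; return None instead of raising, like tryout() does."""
    try:
        r = requests.get(url, timeout=60); r.raise_for_status(); return r.content
    except Exception: return None

urls = [u for _, u in imgs]
with ThreadPoolExecutor(max_workers=16) as ex:
    blobs = list(ex.map(fetch, urls))              # bytes or None, same order as urls
pairs = list(zip(urls, blobs))                     # roughly the (url, imageBytes) rows that end up in imgs.pth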
That was quite fast! What are the images' resolutions?
whs = cat.pickle("imgs.pth") | apply(toImg() | shape(), 1) | deref() | k1.Wrapper()
whs() | shape()
(664, 2, 115)
whs() | cut(1) | T() | (sketch(['plt.xlabel("Value")', 'plt.ylabel("Frequency")', 'plt.yscale("log")'], ["Width", "Height"]) | deref() | aS(plt.hist, bins=30))
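toImg() | shape() boils down to PIL's .size here. A small sketch, assuming pairs of (url, imageBytes) like the imgs.pth rows:
import io
from PIL import Image

def resolution(blob):
    """(width, height) of an image given its raw bytes, or None if it doesn't decode."""
    try: return Image.open(io.BytesIO(blob)).size
    except Exception: return None

# e.g. whs = [resolution(b) for _, b in pairs if b is not None]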
Let's see a few examples from each width segment:
cat.pickle("imgs.pth") | apply(toImg(), 1) | ~apply(lambda x,y: [y,y|shape(0)]) | sort(1) | batchedTrigger(1, delta=60) | apply(unique(1) & iden() | joinSt() | head(5) | reverse()) | apply(T() | iden() + (count() | cut(1) | sort(None) | join(" - "))) | permute(1, 0) | apply(apply(tf.Resize(500)) | viz.Carousel() | toHtml(), 1) | apply(fmt.h, 0, level=3) | reverse() | viz.Carousel(searchMode=2)
Thought this image was cute, so I'm making it the thumbnail:
# thumbnail
cat.pickle("imgs.pth") | apply(toImg(), 1) | ~apply(lambda x,y: [y,y|shape(0)]) | sort(1) | batchedTrigger(1, delta=60) | apply(unique(1) & iden() | joinSt() | head(5)) | rItem(3) | cut(0) | item()
Would be nice to have transcripts of the images:
%%time
cat.pickle("imgs.pth") | ~sortF(len, 1) | cut(1) | rItem(2) | toImg() | kapi.ocr(True, False)
CPU times: user 91.4 ms, sys: 153 ms, total: 244 ms
Wall time: 1.44 s
<Ocr shape=(1079, 3860)>
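If you don't have kapi.ocr() available, pytesseract is a common swap-in for getting rough transcripts (a sketch, assuming both the Python package and the tesseract binary are installed; quality will differ from the output above):
import io
from PIL import Image
import pytesseract                        # pip install pytesseract, plus the tesseract-ocr binary

def transcript(blob) -> str:
    """Rough OCR transcript of one image, given its raw bytes."""
    return pytesseract.image_to_string(Image.open(io.BytesIO(blob)))

# e.g. texts = [(url, transcript(b)) for url, b in pairs if b is not None]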
Ok cool. Should take 15 minutes to get all transcripts:
#notest
cat.pickle("imgs.pth") | tee().autoInc() | applyMp(~aS(lambda x,y: [x, y | toImg() | kapi.ocr()]), timeout=3600, prefetch=10) | apply(dill.dumps) | file("transcripts.pth")
663) 663, 215s elapsed
'transcripts.pth'
#notest
cat.pickle("imgs.pth") | tee().autoInc() | applyMp(~aS(lambda x,y: [x, y | toImg() | kapi.ocr(True, False)]), timeout=3600, prefetch=10) | apply(dill.dumps) | file("transcripts.pth")
663) 663, 347s elapsed
'transcripts.pth'
Only takes 6 minutes! Wow, that's faster than I thought!
Grab the Gensokyo days pages only:
pg_ims = imgs | ~apply(lambda x,y: [x,y,y]) | lookup(cat.pickle("imgs.pth") | apply(toImg(), 1) | toDict(), 2) | deref() | k1.Wrapper()
pg_ims() | item() & shape() | deref()
[['/r/touhou/comments/100c9ui/happy_new_year_rtouhou/', 'https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec', <PIL.PngImagePlugin.PngImageFile image mode=P size=960x539>], (664, 3, 50)]
a = pg_ims() | filt(op() | shape(0) | (op()>800), 2) | groupBy(0, True) | apply(cut(0), 1) | deref(); a[:10]
[['/r/touhou/comments/100c9ui/happy_new_year_rtouhou/', ['https://preview.redd.it/xghhdfs92d9a1.png?width=960&crop=smart&auto=webp&s=3e0e3683a8e4d6b5541008bb497a1867144fbcec']], ['/r/touhou/comments/100khxk/130_age_old_question/', ['https://i.redd.it/rfk1bldjwf9a1.png']], ['/r/touhou/comments/101fbft/ex_008_the_strongest_fairy/', ['https://i.redd.it/u5h9j2h6fn9a1.png']], ['/r/touhou/comments/103460g/132_love_is_in_the_airwaves/', ['https://i.redd.it/duk5ymkd71aa1.png']], ['/r/touhou/comments/1040y8j/133_in_need_of_assistance/', ['https://i.redd.it/as386nlao8aa1.png']], ['/r/touhou/comments/104ufea/134_its_funnier_that_way/', ['https://i.redd.it/hk4fr1jzefaa1.png']], ['/r/touhou/comments/106j8gj/135_myon_in_the_mirror/', ['https://i.redd.it/16ck426nmtaa1.png']], ['/r/touhou/comments/107ebtp/136_just_an_average_day/', ['https://i.redd.it/qmbhf216r0ba1.png']], ['/r/touhou/comments/108d1k0/137_she_got_there_in_the_end/', ['https://i.redd.it/9zsevr71m8ba1.png']], ['/r/touhou/comments/1097ggt/138_maintenance/', ['https://i.redd.it/qwvdiu8hnfba1.png']]]
idxs = a | cut(0) | op().split("/")[5].split("_")[0].all() | toInt() | filt("x<500") | sort(None) | deref()
idxs | window(2) | ~apply(lambda x,y: y-x) | toSum() & shape(0) | deref()
[419, 410]
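The episode number is just the first chunk of the URL slug, so the missing-page arithmetic is small in plain Python (a sketch, assuming a is the [pageUrl, [imageUrls]] list from above; non-episode posts get skipped):
def episodeNo(pageUrl):
    """'/r/touhou/comments/100khxk/130_age_old_question/' -> 130, or None for non-episode posts."""
    slug = pageUrl.split("/")[5].split("_")[0]
    return int(slug) if slug.isdigit() else None

idxs = sorted(n for page, _ in a if (n := episodeNo(page)) is not None and n < 500)
span = idxs[-1] - idxs[0]          # 419, same as the telescoped sum of consecutive differences above
(span + 1) - len(idxs)             # number of episode indexes in that range we never crawled, ~9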
Those two numbers are very close. Looks like we're only missing 9 pages out of 420 (nice). Since the images themselves would be too large to fit in here, I'm just going to include links to them:
a | apply(op().split("/")[5].split("_")[0], 0) | toInt(0) | filt("x<500", 0) | sort(0) | apply(item(), 1) | ~apply(lambda idx, url: f"<a style='white-space: nowrap' target='_blank' href='{url}'>Episode {idx}</a>") | batched(20, True) | apply(fmt.col) | aS(fmt.row) | aS(IPython.display.HTML)
Ok cool! Well, looks like we're done here.