hax01tips
注意:本文之后極有可能設為私有
題目

hax01
Your mission is the following: Simply enter a URL into the box. The domain of the URL must be or end with 'nasa.gov'. The URL will be fetched right away. The content returned should contain the string: "2200178118" in the first 10 Kbytes of data. 404/403/etc error pages are not accepted. Remember, do not do anything illegal. Make sure you type the right URL, do not guess.

Hint: google is your friend.
http://google.com/search?q=site:nasa.gov


當時我的思路是找出所有以nasa.gov結尾的域名,然后遍歷這些網址。之后我真的寫了個python程序,取了google檢索出來的前1000個頁面,取出域名,保存起來,去除重復的有500多個。接著,讀取html頁面,判斷頁面中是否含有目標字符串。其間,遇到了個網速的問題,超時后經常會跑到電信的114搜索上去。驗證了170多個頁面后,我發現自己理解錯題目了,這里的URL并不是指URL以nasa.gov結尾,而是指URL的域名以nasa.gov結尾。我無語了,這相當于域名下的所有網頁都有可能。這個工作量巨大得幾乎是不可能的。暫時中止。
以下代碼可供參考,修改了n次,可能現在已經沒法直接運行。

2.5
1
from urllib import FancyURLopener
2
import urllib2
3
import sys
4
import re
5
import locale
6
"""
7
class MyOpener(FancyURLopener):
8
version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11'
9
10
res = re.compile(r'(([a-zA-Z]+\.)+nasa.gov)')
11
12
myopener = MyOpener()
13
url = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
14
li = []
15
for i in range(0, 10):
16
url = url + '&start=' + str(i*100)
17
page = myopener.open(url)
18
str1 = page.read()
19
for aItem in res.findall(str1):
20
if not aItem[0] in li:
21
li.append(aItem[0])
22
"""
23
with open('nasa.txt') as li:
24
#li = open('nasa.txt')
25
#print li.count
26
m = 0
27
for a in li:
28
#print 'http://'+a
29
m = m + 1
30
print m
31
url = a
32
req = urllib2.Request(url)
33
try:
34
response = urllib2.urlopen(req)
35
the_page = response.read()
36
with open(url + '.txt') as nasa:
37
write(the_page)
38
if the_page.find(r'daohang.118114.cn') <> -1 :
39
print '114'
40
elif the_page.find('2200178118', 0, 10240) <> -1 :
41
print url
42
else :
43
print '
'
44
except urllib2.URLError, e:
45
print e.reason
46
47
"""
48
#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
49
#google = urllib.urlopen(gUrl)
50
#str = google.read()
51
for str in open('sitenasa_gov.htm'):
52
for aItem in res.findall(str):
53
print aItem[0]
54
55
#print str
56
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
57
58
"""
59
60
61
#2200178118
62

3.1
# Python 3.1 version.
# Reads candidate URLs (one "http://host" per line) from the file named on
# the command line, fetches each page, saves the raw body to "<host>.txt",
# and reports whether the target string "2200178118" appears in the first
# 10 KB of the response.
from urllib.request import FancyURLopener
# Import the submodules explicitly: a bare 'import urllib' does not bind
# urllib.request / urllib.error; the original only worked because the
# FancyURLopener import above pulled them in as a side effect.
import urllib.request
import urllib.error
import sys
import re
import locale

"""
Harvesting step (run once; the resulting domain list was saved to a file):

class MyOpener(FancyURLopener):
    # Pretend to be Firefox so Google does not reject the scraper.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11'

res = re.compile(r'(([a-zA-Z]+\.)+nasa.gov)')

myopener = MyOpener()
url = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
li = []
for i in range(0, 10):
    url = url + '&start=' + str(i*100)
    page = myopener.open(url)
    str1 = page.read()
    for aItem in res.findall(str1):
        if not aItem[0] in li:
            li.append(aItem[0])
"""

fiPath = sys.argv[1]  # path of the URL list file
with open(fiPath) as li:
    m = 0
    for a in li:
        m = m + 1
        # Strip the trailing newline that file iteration keeps, so the
        # request URL is clean (the old [7:-1] slice only fixed the
        # filename, not the URL passed to Request).
        url = a.strip()
        req = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(req)
            the_page = response.read()
            # Save a copy of the body; drop 'http://' so the host name is a
            # usable file name.  The 'with' block closes (and flushes) the
            # file, so no explicit flush() is needed.
            with open(url[7:] + '.txt', 'wb') as nasa:
                nasa.write(the_page)
            # Decode once instead of once per test.
            text = the_page.decode('utf8')
            # The ISP hijacks failed lookups and redirects to its search
            # portal; detect and label those pages.
            if text.find(r'icc.qonc.com') != -1:
                print('114')
            elif text.find('2200178118', 0, 10240) != -1:
                print(url)  # hit: this page contains the target string
            else:
                print('')
        except urllib.error.URLError as e:
            # Only HTTPError carries .code; a plain URLError (DNS failure,
            # refused connection, ...) only has .reason.
            print(getattr(e, 'code', e.reason))
        except UnicodeDecodeError as UDE:
            print(UDE)

"""
Scratch notes, kept for reference:

#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
#google = urllib.urlopen(gUrl)
#str = google.read()
for str in open('sitenasa_gov.htm'):
    for aItem in res.findall(str):
        print aItem[0]

#print str
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
"""

#2200178118
而后,過了大概幾個月,變換思路,解決,意外地簡單……
事實上,只要向服務器提交數據,一般服務器也會將該數據返回到頁面上。該題最后的hint不是讓我們來搜該域名,而是告訴我們怎樣在google.com的頁面上顯示我們想要的數據。譬如
http://www.google.co.jp/search?q=2200178118 該頁面的前10K里應該包含了該字符串。接下來,我們只需要在nasa.gov上找個頁面提交數據就行了。
over