From 280d75b52d6d54a0b6dd81d561970f61e68d8437 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Tue, 27 Jun 2023 19:22:37 +0800
Subject: [PATCH] perf: update GVLK fetch script

---
 gvlk/fetch.py | 72 +++++++++++++++++++--------------------------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/gvlk/fetch.py b/gvlk/fetch.py
index 3a07b16..a1e8921 100755
--- a/gvlk/fetch.py
+++ b/gvlk/fetch.py
@@ -9,69 +9,51 @@ LANG = yaml.full_load(open('config.yml').read())['lang']
 URL = 'https://learn.microsoft.com/%s/windows-server/get-started/kms-client-activation-keys'
 
 
-def extractKeys(items: list) -> dict:  # detached from original html elements
-    def splitHeader(header) -> tuple[str, str]:
-        return header['id'], header.text
-
-    def splitTable(table) -> dict:  # split from html table
-        dat = {}
-        for item in [x for x in table.tbody if x.name == 'tr']:
-            name, key = item.select('td')
-            dat[str(name)[4:-5].replace('<br/>', '\n')] = key.text
-        return dat
-
-    result = {}
-    for index in range(len(items)):
-        if items[index].name == 'table':
-            keyContent = splitTable(items[index])  # GVLK content
-            keyId, keyName = splitHeader(items[index - 1])
-            result[keyId] = {
-                'name': keyName,
-                'content': keyContent
-            }
-    return result
-
-
-def fetchGvlk(lang: str) -> dict:  # fetch GVLKs of the specified language
+def fetchGvlks(lang: str) -> dict:  # fetch GVLKs of the specified language
     request = requests.get(URL % lang, timeout = 15)
     request.raise_for_status()  # only http-code 2xx
     request.encoding = 'utf-8'
     content = BeautifulSoup(request.text, 'lxml').select('.content')[0]  # html parsing
 
-    result = []
-    for element in content.children:
-        try:
-            if element['id'] == 'generic-volume-license-keys-gvlk':
-                result = []  # GVLK record begin
-        except: pass
-        if element.name in ['h3', 'h4', 'table']:  # match target DOM
-            result.append(element)
-    return extractKeys(result)
+    items = [x for x in content.children if x.name in ['h2', 'h3', 'h4', 'table']]  # match target DOMs
+    htmlIds = [x['id'] if 'id' in x.attrs else '' for x in items]
+    items = items[htmlIds.index('generic-volume-license-keys-gvlk'):]  # located GVLKs section
+
+    gvlks = {}
+    for index in range(len(items)):
+        if items[index].name == 'table':
+            header = items[index - 1]  # last h3/h4 DOM
+            table = [x for x in items[index].tbody if x.name == 'tr']  # current table DOM
+            text = lambda x: str(x)[4:-5].replace('<br/>', '\n')  # extract DOM text
+            gvlks[header['id']] = {
+                'name': header.text,  # GVLKs title
+                'content': {
+                    text(x.select('td')[0]): x.select('td')[1].text for x in table  # extract GVLKs
+                }
+            }
+    return gvlks
 
 
-def combineGvlk(rawData: dict) -> dict:  # merge multiple languages
+def combineGvlks(rawData: dict) -> dict:  # merge multiple languages
     firstVal = lambda x: list(x.values())[0]
     flipDict = lambda x: {v: k for k, v in x.items()}
 
-    def release(version: str) -> dict:
+    def combined(version: str) -> dict:
         keys = [x for _, x in firstVal(rawData)[version]['content'].items()]
-        gvlkItem = {
+        gvlksItem = {
             'name': {lang: data[version]['name'] for (lang, data) in rawData.items()},
             'content': [{'name': {}, 'key': x} for x in keys]
         }
         for index in range(len(keys)):
             for (lang, data) in rawData.items():
                 data = flipDict(data[version]['content'])
-                gvlkItem['content'][index]['name'][lang] = data[keys[index]]
-        return gvlkItem
+                gvlksItem['content'][index]['name'][lang] = data[keys[index]]
+        return gvlksItem
 
-    result = {}
-    for gvlkVersion in list(firstVal(rawData)):
-        result[gvlkVersion] = release(gvlkVersion)
-    return result
+    return {x: combined(x) for x in list(firstVal(rawData))}
 
 
 if __name__ == '__main__':
-    gvlkData = combineGvlk({x: fetchGvlk(x) for x in LANG})
-    with open('raw.json', 'w') as fp:  # output as `raw.json`
-        fp.write(json.dumps(gvlkData, indent = 2, ensure_ascii = False) + '\n')
+    gvlksData = combineGvlks({x: fetchGvlks(x) for x in LANG})
+    with open('raw.json', 'w') as fp:  # output at `raw.json`
+        fp.write(json.dumps(gvlksData, indent = 2, ensure_ascii = False) + '\n')