Wikipedie:GPT vypisovač shrnutí diskusí z interwiki
Vzhled
Následující program v Pythonu 3 je založený na umělé inteligenci. Projde interwiki zadaného článku Wikipedie (jeho jméno zapište přímo do programu, jde o "holý" program bez uživatelského rozhraní) a vypíše tabulku s přehledem a shrnutím všech diskusních vláken na všech interwiki daného článku.
Protože program přes API provolává model řady GPT, je potřeba mít na stránkách firmy OpenAI zakoupeno právo model takto používat (https://openai.com/api/).
import requests
import re
import csv
import json
from openai import OpenAI
from collections import defaultdict
client = OpenAI(api_key=my_api_key) # za my_api_key dosadit klíč od firmy OpenAI - lze ho zakoupit na jejich webu
"""
Program shrne obsah diskusních stránek článku a jeho interwiki
"""
# vstupy
article_title = "John Woo" # jméno vstupního článku v uvozovkách
language = "cs" # zkratka Wikipedie, na které je umístěn
outlang = "Czech" # jazyk, ve kterém požadujeme shrnutí
def get_interwiki(title, language='en', include_article_language = True):
    """Creates a list of interwikis for a given article.

    Args:
        title: article title on the source Wikipedia.
        language: language code of the source Wikipedia (e.g. "cs", "en").
        include_article_language: when True, the source article itself is
            included as the first entry of the result.

    Returns:
        list of dicts {"language": <lang code>, "article": <title>}.
    """
    url = f"https://{language}.wikipedia.org/w/api.php"
    # Pass the title via params so requests URL-encodes it; the original
    # f-string interpolation broke on titles containing spaces, '&' etc.
    params = {
        "action": "query",
        "prop": "langlinks",
        "format": "json",
        "titles": title,
        "lllimit": 500,
    }
    response = requests.get(url, params=params)
    data = response.json()
    interwiki = []
    if include_article_language:
        interwiki.append({'language': language, 'article': title})
    # .get() chains avoid KeyError when the page is missing or the API
    # answers with an unexpected shape.
    pages = data.get('query', {}).get('pages', {})
    for page_data in pages.values():
        for link in page_data.get('langlinks', []):
            interwiki.append({"language": link['lang'], "article": link['*']})
    return interwiki
def get_discussion_page_source(title, language):
    """Fetch the raw wikitext of the talk page of *title* on the *language*
    Wikipedia; returns "" when the request fails or no revision exists."""
    api_url = f"https://{language}.wikipedia.org/w/api.php"
    query = {
        "action": "query",
        "prop": "revisions",
        "titles": f"Talk:{title}",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
    }
    resp = requests.get(api_url, params=query)
    if resp.status_code != 200:
        return ""
    payload = resp.json()
    # The API keys the result by page id; take the first page that
    # actually carries a revision.
    for page in payload.get("query", {}).get("pages", {}).values():
        if "revisions" in page:
            return page["revisions"][0]["slots"]["main"]["*"]
    return ""
def split_chapters(wikitext):
    """
    Split the source of a Wikipedia page into chapters delimited by
    == level 2 == headings and return a dict mapping the chapter index
    (0-based; index 0 also collects any text before the first heading)
    to {"Text": ..., "Length": ..., "Heading": ...}.
    """
    # NOTE: the pattern is unanchored at the end on purpose (see the
    # history of this code), so trailing text after the closing '==' is
    # tolerated.
    heading_re = re.compile(r'^(=+)\s*(.*?)\s*(=+)')
    chapters = defaultdict(lambda: {"Text": "", "Length": 0, "Heading": ""})

    index = 0          # current chapter number
    buffer = []        # lines collected for the current chapter
    heading = ""       # heading of the current chapter

    def flush():
        # Persist the buffered lines under the current chapter index.
        if buffer or heading:
            joined = "\n".join(buffer)
            chapters[index]["Text"] = joined
            chapters[index]["Length"] = len(joined)
            chapters[index]["Heading"] = heading

    for line in wikitext.split('\n'):
        match = heading_re.match(line)
        if not match:
            buffer.append(line)
            continue
        # Any heading checkpoints the chapter collected so far.
        flush()
        if len(match.group(1)) == 2:
            # A new level-2 heading opens a new chapter, but only advance
            # the index once the current chapter has some text.
            if chapters[index]["Text"]:
                index += 1
            heading = match.group(2).strip()
            buffer = [line]
        else:
            # Sub-headings (and level-1 headings) stay inside the chapter.
            buffer.append(line)
    flush()
    return dict(chapters)
def shrn_diskusi(chapters, outlang="Czech"):
    """
    Summarize all discussion threads via GPT.

    Input: the dictionary produced by split_chapters and outlang = the language
    the summaries should be written in. Produces a list of dictionaries with
    "Item" (thread number from zero), "Heading" (chapter heading),
    "Translated_heading" (ditto in outlang), "Summary" (the summary) and
    "Start_date" (date the discussion started).
    """
    outlist = []
    for i in range(len(chapters)):
        ch = chapters[i]
        # Trim to 3000 chars and strip characters that could break the JSON
        # the model is asked to produce.
        tx = ch['Text'][:3000].replace('"', "'").replace('{', "").replace('}', "")
        hd = ch['Heading'].replace('"', "'").replace('[', "").replace(']', "")
        summary = chapter_summary(tx, hd, outlang) # call GPT to translate and summarize
        json_match = re.search(r'{.*?}', summary, re.DOTALL)
        if json_match:
            json_substring = json_match.group()
            try:
                response = json.loads(json_substring) # Parse the JSON substring to a Python dictionary
            except json.JSONDecodeError:
                print(str(i) + "*** Error: nenaparsovano ***" + json_substring + "\n")
                response = {}
        else:
            print(str(i) + "*** Error: nevznikl JSON ***" + summary + "\n")
            response = {}
        response['Item'] = i
        response['Heading_orig'] = ch['Heading']
        response['Text_orig'] = ch['Text']
        response['Summary_output'] = summary
        if len(str(ch['Heading']).strip()) == 0:
            # BUG FIX: the original used '==' (a no-op comparison) instead of
            # '=' here, so the "N/A" marker was never actually assigned.
            response['Translated_heading'] = "N/A"
        outlist.append(response)
    return outlist
def chapter_summary(wikitext, heading, outlang):
    """From the text of one discussion thread (wikitext) and its heading
    (heading), ask GPT to produce a JSON in the outlang language with the items
    "Translated_heading" (the chapter heading in outlang), "Start_date" and
    "Summary" (the summary). Returns the raw model output as a string."""
    head = heading
    if head == "":
        head = "N/A"
    # BUG FIX: the fallback instruction originally named the key 'Translation'
    # although the JSON item is 'Translated_heading', which could make the
    # model emit an inconsistent key; also fixed the 'sumarize' typo.
    prompt = f"The goal is to process a Wikipedia discussion: translate its heading into the {outlang} language \
and summarize the discussion in the same {outlang} language. The original heading is {head}. \
You will create a JSON (within curly brackets) with three items: \
- 'Translated_heading', which is the heading in {outlang}, \
- 'Start_date', which is the earliest date of a signature in the discussion (in {outlang}), and \
- 'Summary', which is the summary of the content of the discussion, written in {outlang}. \
If the content of the discussion are only templates and/or tables and/or bot edits, 'Translated_heading' \
is 'N/A' and 'Summary' is empty. \
Do not use square brackets, curly brackets or apostrophes in the output except for the purpose of JSON. \
The discussion is here: \
{wikitext}"
    response = client.chat.completions.create(
        model="gpt-3.5-turbo", # cheaper than gpt-4o
        max_tokens=4096,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "You are a JSON generator. Your output can be only a JSON (within curly brackets {}). Otherwise write nothing."
                    }
                ]
            },
            {
                "role": "user",
                "content": prompt
            },]
    )
    msgtext = response.choices[0].message.content
    return msgtext
def iw_summaries(article_title, language="cs", outlang="Czech"):
    """In the outlang language, produce summaries of the discussions on all
    interwikis of the article article_title from the *language* Wikipedia.

    Returns a list of per-thread dicts (see shrn_diskusi) extended with
    "Language" and a wikilinked "Article_name" pointing at the thread.
    """
    iws = get_interwiki(title=article_title, language=language)
    outlist = []
    for iw in iws:
        disc = get_discussion_page_source(title=iw["article"], language=iw["language"])
        # Skip (near-)empty talk pages; 30 chars is a heuristic threshold.
        if len(disc) > 30:
            # BUG FIX: the original fetched the same talk page from the
            # network a second time here; reuse the already downloaded text.
            diskuse = split_chapters(disc)
            data = shrn_diskusi(diskuse, outlang=outlang)
            for item in data:
                item["Language"] = iw["language"]
                # Link to the whole talk page, or directly to the thread
                # anchor when the thread has a heading.
                if item["Heading_orig"] == "":
                    item["Article_name"] = "[[:"+iw["language"]+":Talk:"+iw["article"]+"]]"
                else:
                    item["Article_name"] = "[[:"+iw["language"]+":Talk:"+iw["article"]+"#"+item["Heading_orig"]+"]]"
            outlist += data
    return outlist
def dictlist_to_wikitable(dict_list, drop_columns=None):
    """
    Converts a list of dictionaries to a Wikimedia wiki table markup.
    Args:
        dict_list (list): A list of dictionaries, all with the same structure.
        drop_columns: list of columns to drop from the output table
    Returns:
        str: The Wikimedia wiki table markup.
    """
    # Avoid the mutable-default-argument pitfall of the original signature.
    drop_columns = drop_columns or []
    table = "{| class=\"wikitable sortable mw-collapsible mw-collapsed\"\n"
    # BUG FIX: the original raised IndexError on an empty input list;
    # emit a valid empty table instead.
    if not dict_list:
        return table + "|}"
    # Columns come from the first dictionary, minus the dropped ones.
    keys = [k for k in dict_list[0] if k not in drop_columns]
    # Header row
    table += "! " + " !! ".join(keys) + "\n"
    # Data rows
    for d in dict_list:
        cells = []
        for k in keys:
            try:
                cells.append(str(d[k]))
            except Exception:
                # Keep the original best-effort behavior: unprintable or
                # missing values become empty cells.
                cells.append("")
        table += ("|-\n| " + " || ".join(cells) + "\n") if cells else "|-\n"
    # Close the table markup
    table += "|}"
    return table
# Entry point: summarize the talk-page threads on all interwikis of the
# configured article and print them as a collapsible sortable wikitable.
diskuse = iw_summaries(article_title, language=language, outlang=outlang)
print(dictlist_to_wikitable(diskuse, drop_columns=["Heading_orig", "Text_orig", "Summary_output"]))