Wikipedie:GPT vypisovač informace, která není obsažena v článku, ale je v interwiki
Následující program v Pythonu 3 je založený na umělé inteligenci. Zadáte krátký článek české Wikipedie, který má interwiki (jeho jméno zapište přímo do programu, jde o "holý" program bez uživatelského rozhraní). Program v bodech vypíše až dvanáct nejdůležitějších informací, které v interwiki nalezl a jež by mohly článek obohatit. Vyžaduje přístup k placeným API firmy OpenAI, což je potřeba si zakoupit na jejich webu.
import requests
import re
import time
from bs4 import BeautifulSoup
from openai import OpenAI
client = OpenAI(api_key=my_api_key) # za my_api_key dosadit klíč od firmy OpenAI - lze ho zakoupit na jejich webu
# Vstupy:
article_name = "Dřevina" # jméno článku v uvozovkách
language = "cs" # Wikipedie
n = 3 # s kolika články ho chceme porovnat
def get_article_length(article_name, language='en'):
"""How long is a given article"""
url = f"https://{language}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={article_name}&exlimit=1&explaintext"
response = requests.get(url)
data = response.json()
page_id = list(data['query']['pages'].keys())[0]
if 'extract' in data['query']['pages'][page_id]:
article_text = data['query']['pages'][page_id]['extract']
article_length = len(article_text)
return article_length
return None
def get_interwiki(article_name, language='en', n=3, include_article_language = False):
"""Creates a list of n interwikis for a given article which have the most developed articles"""
url = f"https://{language}.wikipedia.org/w/api.php?action=query&prop=langlinks&format=json&titles={article_name}&lllimit=500"
response = requests.get(url)
data = response.json()
interwiki = []
if include_article_language:
interwiki = [{'language': language, 'article': article_name,
'length': get_article_length(article_name, language)}]
page_id = list(data['query']['pages'].keys())[0]
if 'langlinks' in data['query']['pages'][page_id]:
langlinks = data['query']['pages'][page_id]['langlinks']
for link in langlinks:
interwiki.append({"language": link['lang'], "article": link['*'],
"length": get_article_length(link['*'], link['lang'])})
return sorted(interwiki, key=lambda x: x['length'], reverse=True)[:n] # only n iw with longest articles
def wiki_plain_text(article_name, language='en'):
"""List of texts of chapters of the article"""
url = f"https://{language}.wikipedia.org/wiki/{article_name}"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
for unwanted in soup(["table", "figure", "sup"]): # remove unwanted tag elements
unwanted.extract()
paras = []
article_text = ''
pridej = True
for header in soup.find(id='mw-content-text').find_all(["h2", "h3", "h4", "p", "ul", "ol", "li"]):
headertext = re.sub(r'\[.*?\]', '', header.text.strip())
headertext = headertext.replace('\xa0', ' ')
delka = len(headertext)
if header.name == "p":
article_text += re.sub('\n', ' ', headertext) + "\n"
if delka > 50:
pridej = True
else:
if pridej:
paras.append(article_text)
article_text = headertext+'\n'
pridej = False
if pridej:
paras.append(article_text)
return [item for item in paras if item.strip() != ""] # Only items which are not empty
def raw_info(article_name, language='en', n=3):
"""List of texts of chapters of n most developed articles = the basis for the reasoning about the subject"""
interwiki = get_interwiki(article_name, language, n)
print("*** Source languages: ")
for i in interwiki:
print(' ' + i['language'])
orig = wiki_plain_text(article_name, language)
info = [orig[0]] # Start with the basic definition in the original language
for iw in interwiki: # Add chapters of the selected interwikis
info += wiki_plain_text(iw['article'], iw['language'])
return info
def info_bits(article_name, language='en', n=3):
"""Creates bits of information about the subject and evaluates their importance"""
info = raw_info(article_name, language, n)
print("*** Chapters read: "+ str(len(info) - 1))
defined = info.pop(0)
prompt = f"We will create list of fact about {article_name}, which Wikipedia describes as: {defined}"
prompt += f"Take the following text and extracts facts about {article_name} from it. "
prompt += "The output will have one fact on one line. There will be short statement of the fact, "
prompt += f"semicolon, and evaluation of the importance of the fact for understanding of {article_name} "
prompt += """on the scale 'Very important', 'Important', 'Less important', 'Unimportant'.
Example of one line of the output: \nThe king had no daughters. ; Unimportant \n """
prompt += f"Write in the language of the original text. The text about {article_name} is: "
facts = ''
for chapter in info: # Process chapters one by one
response = client.chat.completions.create(
model="gpt-3.5-turbo-16k", # "gpt-3.5-turbo-16k" or "gpt-4-turbo"
max_tokens=1200,
messages=[
{
"role": "user",
"content": prompt + chapter
},]
)
msgtext = response.choices[0].message.content
if msgtext is not None:
facts += msgtext
facts += "\n"
else:
print("**No message Prompt** "+ prompt + chapter)
print("**No message Response** "+str(response))
#time.sleep(1)
return facts
def final_list(article_name, language='en', n=3):
"""Compares a Wikipedia article with the information extracted from other interwiki articles"""
info = info_bits(article_name, language, n) # info from the most developed articles
plaintext = ' '.join(wiki_plain_text(article_name, language)) # the analyzed article
prompt = f"The goal is to prepare a list of important fact which are not mentioned in an article \
about {article_name} and can enhance it. \
Therefore write the output in the language of the article, which is {language}. \
Start by comparing the article about {article_name} with a list of facts about the same subject. \
The facts are followed by an importance estimation, but this estimation may be wrong. \
The facts may be written in different languages, \
but you will use {language} in the output. Here is **the article**:\n"
prompt += plaintext
prompt += "\nAnd here is **the list of facts**:\n"
prompt += info
prompt += f"\nSome of the facts may repeat in the list. Create a new numbered list od facts about \
{article_name}, this time without repetition. Each fact will be on a separate line. \
Drop the importance estimation. Mention only those fact which are substantial and not already mentioned \
in the article. Mention at most twelve most important facts. Sort the facts in order if importance. \
Write in the language of the article, which is {language}. Drop all the facts which are contained \
in the article you have read first."
response = client.chat.completions.create(
model="gpt-4o", # "gpt-3.5-turbo-16k" or "gpt-4-turbo"
max_tokens=2200,
messages=[
{
"role": "user",
"content": prompt
},]
)
msgtext = response.choices[0].message.content
return [msgtext, info, plaintext]
info = final_list(article_name, language, n)
print(info[0])