AI副業：ワードクラウド画像を作成したいです。

「さはら３」です。

AI副業でどこまでいけるのか？をテーマに頑張っていきたいと思います。

いつも読んで頂きありがとうございます。

頭の体操
テーマ
頭の体操：解答
本日のAI着物美女

頭の体操

１か所だけ異なる漢字が入っています。（解答は一番最後に掲載）

テーマ

さて、今回のテーマは「ワードクラウド画像」です。

ワードクラウド画像とはなんぞや？

この様な、テキストをマイニングし、画像化したモノを指します。

上記ワードクラウド画像は、本ブログのワードを参考に作成しました。（もちろんChatGPTさんのおかげです。）

スクリプト

生成できたスクリプトは以下です。

import requests
from bs4 import BeautifulSoup
from collections import Counter
import re
import xml.etree.ElementTree as ET
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 対象のURL
url = "https://ai-sahara-navi.com/rss"

# ファイルの保存先
file_path = r"C:\Users\any\Downloads\ai-sahara-navi.com_rss.xml"

# URLからデータを取得
response = requests.get(url)
response.raise_for_status()  # エラーが発生した場合は例外を発生させる

# データをファイルに保存
with open(file_path, 'wb') as file:
    file.write(response.content)

# XMLファイルを読み込む
tree = ET.parse(file_path)
root = tree.getroot()

# 'item'要素内の'link'要素を抜き出す
urls = [item.find('link').text for item in root.findall(".//item")]

# ストップワードのリスト
stop_words = set([
    # 既存のストップワード
    'さはら３', 'sahara3', 'aiさはら', 'dp', '6', '82', 'java', 'sort', 'a3', 'と', '81', 'range', 'd7', 'c0', 'hatenablog', 'b8', '1200', 'eb', '512', 'b8', '1200', 'eb', '512', '768', 'table',
    'of', 'contents', 'co', 'ab', '83', 'ff0000', 'テクノロジー', 'a2', 'c9', 'af', '9', 'で', 'rel', 'noopener', '11696248318754550877', 'align', 'right', 'i', 'main',
    'panel', 'cite', 'ff5252', '0', 'y', 'target', '_blank', 'b', '公式', '2023年開設ブログ', 'b3', 'b0', '2f1672022764175731', '2f62150696', '2fcircle_image', 'user', 
    'faa1264c227008e8b759458790977cdaf6601b23', '4207112889948320640', '10', 'asin', 'detail', 'count', 'b9', 'c6', 'circle', '2fcomputers', 'gif', 'p', 'span',
    'a', 'class', 'strong', 'https', 'li', 'href', 'keyword', 'hatena', 'code', 'div', 'com', 'jp', 'group', 'A5', 'ne', 'embed', 'style', 'd', 'ul', 'id', 'ai', 'sahara', 'navi', 'h3', 'rss',
    '1', 'synConstant', 'color', 'title', 'image', 'figure', 'height', 'h4', 'width', 'fotolife', 'font', 'pre', 'size', '150', 'a5', 'synconstant', 'st', 'link', 'blog', '2', '80', '40', 'br',
    'img', 'src', 'cdn', 'syncomment', '3', '2196f3', 'synidentifier', 'figcaption', 'hr', 'icon', 'quot', 'print', 'www', 'text', '4', 'dd830c', '5', 'lang', 'a4', 'alt', 'label', 'h2', 'png',
    'version', 'synstatement', 'js', 'redirect', 'square', 'backend', 'imagemagick', '3a', '2f', '2fcdn', 'content', 'ランキング参加中', 'images', 'f', 'my_list', 'blockquote', 'ak', 
    's', 'x', 'markdown', 'です', 'bc', 'e3', 'dir', 'ltr', 'f3', 'bb', 'decoration', 'underline', 'c8', 'a1', 'amp', 'mcenoneditable', 'mceeditable', 'e9', 'bd', 'ol', 'ea', 'adad63b72f1d6545b2ba2538c3fc2923b2fd5989',
    '2fimages', '2fcircle', '2fofficial','b1', 'ec', 'b7', 'c', 'in', 'entry', 'cf', 'reverse', 'd2', 'number', 'd4', 'や', 'reader', 'また', '7', 'b5', 'citation', '09', 'h1', 'はい', 'c7', 'e5', 
    'tag', '22', 'linkcode', 'th', 'psc', 'media', 'de', 'name', 'h5', '8', 'e6', 'a6', 'true', 'is', 'synpreproc', 'b2', '20230925', '13', 'd8', '20', 'twitter', 'bf', 'c3', 'ad', 'with', 'list', 
    # 追加のストップワード
    'none', '0px', 'auto', 'data', 'normal', '0s', 'unlink', '1f1f1f', 'loading', 'lazy', 'sans', 'itemprop', 'border', 'animation', 'ease', 'padding', 'box', '24px', 'display', 'margin', 'block', 'creator', 'r', 'serif', 'helvetica', 'iframe', 'orphans', 'transform',
    'widows', 'stroke', '000000', 'neue', 'background', 'spacing', 'all', '16px', 'motivation', 'duration', '600ms', 'running', 'appearance', 'repeat', 'scroll', 'rgba', 'inset', 'clear', 'clip', 'columns', 'contain', 'container', 'cursor', 'cx', 'cy', 
    'direction', 'filter', 'flex', 'gap', 'hyphens', 'manual', 'isolation', 'marker', 'mask', 'offset', 'opacity', 'order'
    
])

# 単語をカウントするためのCounterオブジェクトを作成
word_counter = Counter()

# 各エントリーのタイトルと説明から単語を抽出し、カウント
for item in root.findall(".//item"):
    title = item.find('title').text
    description = item.find('description').text
    content = title + " " + description
    words = re.findall(r'\w+', content.lower())
    words = [word for word in words if word not in stop_words]
    word_counter.update(words)

# 各URLを順番に参照
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # 記事の本文を抽出
    article_content = soup.find('div', class_='entry-content').get_text()
    
    # 単語に分割（小文字に変換してから）
    words = re.findall(r'\w+', article_content.lower())
    
    # ストップワードを除外
    words = [word for word in words if word not in stop_words]
    
    # 単語をカウント
    word_counter.update(words)

# 最も頻繁に出現する単語を取得
hot_words = [word for word, count in word_counter.most_common(100)]

print("ホットワード:")
print(hot_words)

# ホットワードをスペースで区切って文字列に変換
text = ' '.join(hot_words)

# ワードクラウドの設定
wordcloud = WordCloud(background_color="white", width=800, height=600, font_path=r"C:\Windows\Fonts\YuGothB.ttc", contour_width=3, contour_color='steelblue', collocations=False, stopwords=set()).generate(text)

# 画像を表示
plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# 画像を保存
wordcloud.to_file(r"C:\Users\any\Downloads\hotwords.png")

もし動かない場合は、以下のパッケージをインストールしてみてください。

pip install wordcloud
pip install matplotlib
pip install feedparser

スクリプトの説明

スクリプトはおおまかに４つで構成されております。

ブログのRSSをダウンロードして、保存する。（.xml形式）

import requests
from bs4 import BeautifulSoup
from collections import Counter
import re
import xml.etree.ElementTree as ET
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 対象のURL
url = "https://ai-sahara-navi.com/rss"

# ファイルの保存先
file_path = r"C:\Users\any\Downloads\ai-sahara-navi.com_rss.xml"

# URLからデータを取得
response = requests.get(url)
response.raise_for_status()  # エラーが発生した場合は例外を発生させる

# データをファイルに保存
with open(file_path, 'wb') as file:
    file.write(response.content)

ダウンロードしたxmlファイルから、linkを抽出する。

# XMLファイルを読み込む
tree = ET.parse(file_path)
root = tree.getroot()

# 'item'要素内の'link'要素を抜き出す
urls = [item.find('link').text for item in root.findall(".//item")]

linkを１つずつ確認し、頻出単語を集める。（除外単語指定あり）

# ストップワードのリスト
stop_words = set([
    # 既存のストップワード
    'さはら３', 'sahara3', 'aiさはら', 'dp', '6', '82', 'java', 'sort', 'a3', 'と', '81', 'range', 'd7', 'c0', 'hatenablog', 'b8', '1200', 'eb', '512', 'b8', '1200', 'eb', '512', '768', 'table',
    'of', 'contents', 'co', 'ab', '83', 'ff0000', 'テクノロジー', 'a2', 'c9', 'af', '9', 'で', 'rel', 'noopener', '11696248318754550877', 'align', 'right', 'i', 'main',
    'panel', 'cite', 'ff5252', '0', 'y', 'target', '_blank', 'b', '公式', '2023年開設ブログ', 'b3', 'b0', '2f1672022764175731', '2f62150696', '2fcircle_image', 'user', 
    'faa1264c227008e8b759458790977cdaf6601b23', '4207112889948320640', '10', 'asin', 'detail', 'count', 'b9', 'c6', 'circle', '2fcomputers', 'gif', 'p', 'span',
    'a', 'class', 'strong', 'https', 'li', 'href', 'keyword', 'hatena', 'code', 'div', 'com', 'jp', 'group', 'A5', 'ne', 'embed', 'style', 'd', 'ul', 'id', 'ai', 'sahara', 'navi', 'h3', 'rss',
    '1', 'synConstant', 'color', 'title', 'image', 'figure', 'height', 'h4', 'width', 'fotolife', 'font', 'pre', 'size', '150', 'a5', 'synconstant', 'st', 'link', 'blog', '2', '80', '40', 'br',
    'img', 'src', 'cdn', 'syncomment', '3', '2196f3', 'synidentifier', 'figcaption', 'hr', 'icon', 'quot', 'print', 'www', 'text', '4', 'dd830c', '5', 'lang', 'a4', 'alt', 'label', 'h2', 'png',
    'version', 'synstatement', 'js', 'redirect', 'square', 'backend', 'imagemagick', '3a', '2f', '2fcdn', 'content', 'ランキング参加中', 'images', 'f', 'my_list', 'blockquote', 'ak', 
    's', 'x', 'markdown', 'です', 'bc', 'e3', 'dir', 'ltr', 'f3', 'bb', 'decoration', 'underline', 'c8', 'a1', 'amp', 'mcenoneditable', 'mceeditable', 'e9', 'bd', 'ol', 'ea', 'adad63b72f1d6545b2ba2538c3fc2923b2fd5989',
    '2fimages', '2fcircle', '2fofficial','b1', 'ec', 'b7', 'c', 'in', 'entry', 'cf', 'reverse', 'd2', 'number', 'd4', 'や', 'reader', 'また', '7', 'b5', 'citation', '09', 'h1', 'はい', 'c7', 'e5', 
    'tag', '22', 'linkcode', 'th', 'psc', 'media', 'de', 'name', 'h5', '8', 'e6', 'a6', 'true', 'is', 'synpreproc', 'b2', '20230925', '13', 'd8', '20', 'twitter', 'bf', 'c3', 'ad', 'with', 'list', 
    # 追加のストップワード
    'none', '0px', 'auto', 'data', 'normal', '0s', 'unlink', '1f1f1f', 'loading', 'lazy', 'sans', 'itemprop', 'border', 'animation', 'ease', 'padding', 'box', '24px', 'display', 'margin', 'block', 'creator', 'r', 'serif', 'helvetica', 'iframe', 'orphans', 'transform',
    'widows', 'stroke', '000000', 'neue', 'background', 'spacing', 'all', '16px', 'motivation', 'duration', '600ms', 'running', 'appearance', 'repeat', 'scroll', 'rgba', 'inset', 'clear', 'clip', 'columns', 'contain', 'container', 'cursor', 'cx', 'cy', 
    'direction', 'filter', 'flex', 'gap', 'hyphens', 'manual', 'isolation', 'marker', 'mask', 'offset', 'opacity', 'order'
    
])

# 単語をカウントするためのCounterオブジェクトを作成
word_counter = Counter()

# 各エントリーのタイトルと説明から単語を抽出し、カウント
for item in root.findall(".//item"):
    title = item.find('title').text
    description = item.find('description').text
    content = title + " " + description
    words = re.findall(r'\w+', content.lower())
    words = [word for word in words if word not in stop_words]
    word_counter.update(words)

# 各URLを順番に参照
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # 記事の本文を抽出
    article_content = soup.find('div', class_='entry-content').get_text()
    
    # 単語に分割（小文字に変換してから）
    words = re.findall(r'\w+', article_content.lower())
    
    # ストップワードを除外
    words = [word for word in words if word not in stop_words]
    
    # 単語をカウント
    word_counter.update(words)

# 最も頻繁に出現する単語を取得
hot_words = [word for word, count in word_counter.most_common(100)]

print("ホットワード:")
print(hot_words)

ワードクラウド画像を生成し、保存する。

# ホットワードをスペースで区切って文字列に変換
text = ' '.join(hot_words)

# ワードクラウドの設定
wordcloud = WordCloud(background_color="white", width=800, height=600, font_path=r"C:\Windows\Fonts\YuGothB.ttc", contour_width=3, contour_color='steelblue', collocations=False, stopwords=set()).generate(text)

# 画像を表示
plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# 画像を保存
wordcloud.to_file(r"C:\Users\any\Downloads\hotwords.png")