1. menu_extraction.py - data.json 추출

1. menu_extraction.py - data.json 추출

2023. 11. 28. 17:47ㆍ개인 프로젝트/📚 톡방 통계 프로그램

1. menu_extraction.py 소스코드 전문

import json
import json_extraction
import json_extraction2
import json_extraction_kakaoPC1
import json_extraction_kakaoPC2
import json_extraction_kakaoMobile1
import json_extraction_kakaoMobile2



def main():
    """Run the interactive extraction menu.

    Loads ``config.json`` from the current working directory, echoes the
    configured ``input_file`` and ``output_file`` values, then repeatedly
    shows the menu and runs the extractor matching the user's choice.
    Entering ``0`` leaves the loop; any unknown choice prints a hint and
    shows the menu again.
    """
    # Load the settings from the JSON config file.
    with open('config.json', "r", encoding="utf-8") as file:
        config_data = json.load(file)

    # Echo the configured Telegram paths.
    print(config_data['input_file'])
    print(config_data['output_file'])

    menu_lines = (
        "11. 텔레그램 data.json 추출",
        "12. 텔레그램 dataYYYYMM.json 추출",
        "21. 카카오톡 PC data.json 추출",
        "22. 카카오톡 PC dataYYYYMM.json 추출",
        "31. 카카오톡 Mobile data.json 추출",
        "32. 카카오톡 Mobile dataYYYYMM.json 추출",
        "0. 종료",
    )

    # Lambdas keep the config lookups lazy: a key is only read when the
    # corresponding menu entry is actually selected.
    actions = {
        "11": lambda: json_extraction.run(
            config_data['input_file'], config_data['output_file']
        ),
        "12": lambda: json_extraction2.run(config_data['input_file']),
        "21": lambda: json_extraction_kakaoPC1.run(
            config_data['input_file_kakao']
        ),
        "22": lambda: json_extraction_kakaoPC2.run(
            config_data['input_file_kakao']
        ),
        "31": lambda: json_extraction_kakaoMobile1.run(
            config_data['input_file_kakao']
        ),
        "32": lambda: json_extraction_kakaoMobile2.run(
            config_data['input_file_kakao']
        ),
    }

    while True:
        for line in menu_lines:
            print(line)
        choice = input("원하는 기능을 선택하세요: ")

        if choice == "0":
            break

        action = actions.get(choice)
        if action is None:
            print("올바른 선택지를 입력하세요.")
            continue
        action()

코드를 보면 메뉴를 텔레그램, 카카오톡 PC, 카카오톡 Mobile의 세 가지로 나누었습니다. 플랫폼마다 대화 데이터의 추출 양식이 다르기 때문에 구분했습니다.

여섯 함수의 기능은 엇비슷하기에 하나를 예시로 들어 설명하겠습니다.

2. json_extraction - 텔레그램 data.json 추출

import json


def _mask_shared_links(text_parts, replacement_text):
    """Overwrite, in place, the text of link entities that start with "http"."""
    for part in text_parts:
        # Plain strings and non-link entities are left untouched.
        if not isinstance(part, dict) or part.get("type") != "link":
            continue
        url = part.get("text")
        # Guard the type so an unexpected non-string value cannot raise.
        if isinstance(url, str) and url.startswith("http"):
            part["text"] = replacement_text


def run(input_file, output_file):
    """Extract date/sender/text from a chat export and write it as JSON.

    Reads ``input_file`` as UTF-8 JSON. When the top-level value is an object,
    every dict entry of its ``"messages"`` list becomes one output record
    ``{"date": ..., "name": <the message's "from">, "text": ...}``. A message
    without a ``"text"`` key gets the text ``"(NULL)"``. When the text is a
    list, each ``{"type": "link"}`` entity whose text starts with ``"http"``
    has that text replaced by ``"(인터넷공유)"``.

    The resulting list (empty when the top-level value is not an object or
    ``"messages"`` is missing/null) is written to ``output_file`` as indented,
    non-ASCII-escaped JSON, and a completion message is printed.

    Args:
        input_file: Path of the exported chat JSON to read.
        output_file: Path of the filtered JSON file to write.
    """
    # Placeholder that replaces shared URLs.
    replacement_text = "(인터넷공유)"

    # Read the exported JSON.
    with open(input_file, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Collected output records.
    filtered_data = []

    if isinstance(json_data, dict):
        # `or []` also covers an explicit `"messages": null`.
        for message in json_data.get("messages") or []:
            if not isinstance(message, dict):
                continue

            message_text = message.get("text", "(NULL)")
            if isinstance(message_text, list):
                _mask_shared_links(message_text, replacement_text)

            filtered_data.append({
                "date": message.get("date"),
                "name": message.get("from"),
                "text": message_text,
            })

    # Write the filtered records to the new JSON file.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, indent=4, ensure_ascii=False)

    print("추출이 완료되었습니다.")

위 코드는 데이터를 가공할 때 텍스트 메시지만 사용할 예정이므로, 각 메시지에서 날짜(date), 보낸 사람(from), 본문(text)만 추려서 저장합니다. 또한 본문 중 type이 link이고 http로 시작하는 항목은 "(인터넷공유)"로 치환하도록 했는데, 이는 게시글 링크를 자주 공유하는 친구들 때문에 추가한 처리입니다. 하루에 20개 이상 스팸처럼 링크를 올리는 친구가 한 명이라도 있으면 그 친구의 메시지 기록이 오염되기 때문입니다.

이후 가공된 데이터는 ./src/ 안에 저장이 됩니다. 아래는 결과 예시입니다.