[PaddleOCR] 학습용 REC 데이터셋 생성 스크립트(업데이트 버전)

기존 코드 데이터셋 생성 스크립트를 개선한 버전으로, 단어 구조를 안정화하고 한글 + 영어 데이터셋을 함께 생성한다. 또한 한글 폰트를 미리 다운로드(설치)하여 한글 글자 깨짐을 사전에 방지한다.

LEXER_MAP = {
    '.py': 'python',
    '.js': 'javascript',
    '.mjs': 'javascript',
    '.ts': 'typescript',
    '.tsx': 'tsx',
    '.html': 'html',
    '.css': 'css',
    '.yml': 'yaml',
    '.yaml': 'yaml',
}

특정 프로그래밍 언어에서 자주 나타나는 패턴의 기호와 기호 조합을 추출하여 아래 목록으로 정의한다.

SYMBOLS = [
    '(', ')', '{', '}', '[', ']', '.', ':', ';', '=', '+', '-', '*', '/', '<', '>', 
    '|', '_', '&', '$', '@', '!', '?', '%', '^', '~', '`', '#', '\\', '"', "'", ',',
    '::', '...', '->', '=>', '&&', '||', '??', '?:', '===', '!==', '>>', '<<', '>>>',
    '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '<=', '>=', '==', '!=',
    '\\n', '\\t', '\\r', '\\"', "\\'", '\\\\',
]

코드 전체 예시

"""
PaddleOCR 학습용 REC 데이터셋 생성 스크립트 (고급 버전)

코드 캡쳐본에서 영어+한글+특수기호를 공백/레이아웃까지 최대한 보존하여 추출
들여쓰기는 라벨에서 제거하고 bbox 기반 후처리로 복원 (정책1)

특징:
- 버킷 기반 샘플링 (word_short, phrase_short, line_mid, line_long, symbol_focus, korean_focus)
- 한글 데이터 비율 강제 (35~50%)
- 실제 캡쳐 노이즈 증강 (30~60%)
- 롱 라인 샘플 (120~220자, 10~20%)
- 기호 커버리지 강화
- CLI 옵션 지원
"""

import sys
import os
import random
import re
import argparse
from pathlib import Path
from typing import List, Tuple, Optional, Dict
from tqdm import tqdm
import numpy as np
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
import cv2

# chardet (automatic encoding detection) is optional; record whether it imported
try:
    import chardet
    CHARDET_AVAILABLE = True
except ImportError:
    CHARDET_AVAILABLE = False
    print("[WARN] chardet이 설치되지 않았습니다. UTF-8로만 읽습니다.")

# Pygments is optional; without it tokenization falls back to a single plain token
try:
    from pygments.lexers import get_lexer_by_name
    PYGMENTS_AVAILABLE = True
except ImportError:
    PYGMENTS_AVAILABLE = False
    print("[WARN] Pygments가 설치되지 않았습니다. 문법 강조 없이 진행합니다.")

# On Windows, re-wrap stdout/stderr so console output is encoded as UTF-8
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# ============================================================================
# Settings (defaults; several of them are exposed as CLI options in main())
# ============================================================================

# Project root: four directory levels above this file
WORKSPACE_ROOT = Path(__file__).parent.parent.parent.parent
# Output layout: everything is written under <this file's directory>/train_data/rec
CORE_TRAIN_DIR = Path(__file__).parent
TRAIN_DATA_DIR = CORE_TRAIN_DIR / "train_data"
REC_DIR = TRAIN_DATA_DIR / "rec"
REC_TRAIN_IMAGES_DIR = REC_DIR / "train"
REC_VAL_IMAGES_DIR = REC_DIR / "val"
REC_GT_TRAIN_FILE = REC_DIR / "rec_gt_train.txt"
REC_GT_VAL_FILE = REC_DIR / "rec_gt_test.txt"

# Default target sample counts
DEFAULT_TARGET_TRAIN_COUNT = 25000
DEFAULT_TARGET_VAL_COUNT = 5000

# Directory names skipped while scanning for source files
EXCLUDE_DIRS = {
    '__pycache__', 'node_modules', '.git', 'venv', 'venv_ocr',
    'outputs', 'output', 'train_data', 'code_syntax_dataset',
    '.next', 'dist', 'build', '.venv'
}

# File names skipped while scanning (label files)
EXCLUDE_FILES = {
    'rec_gt.txt', 'rec_gt_train.txt', 'rec_gt_test.txt'
}

# Supported source file extensions
CODE_EXTENSIONS = {'.py', '.js', '.html', '.css', '.yml', '.yaml', '.ts', '.tsx', '.mjs'}

# File extension -> Pygments lexer name
LEXER_MAP = {
    '.py': 'python',
    '.js': 'javascript',
    '.mjs': 'javascript',
    '.ts': 'typescript',
    '.tsx': 'tsx',
    '.html': 'html',
    '.css': 'css',
    '.yml': 'yaml',
    '.yaml': 'yaml',
}

# Font priority (monospace fonts for Latin + Korean); earlier entries are tried first
FONT_PRIORITY = [
    'D2Coding',  # Korean-capable monospace (priority 1)
    'D2CodingBold',
    'Noto Sans Mono CJK KR',  # Korean-capable monospace
    'Malgun Gothic',  # Default Windows Korean font (not monospace, used as a substitute)
    'Fira Code',
    'JetBrains Mono',
    'Consolas',
    'Courier New',
    'Lucida Console',
    'Monaco',
    'Menlo',
    'DejaVu Sans Mono',
]

# Whether a Korean-capable font is mandatory (overridable via CLI)
DEFAULT_REQUIRE_KOREAN_FONT = False

# Bucket ratios (overridable via CLI)
BUCKET_RATIOS = {
    'word_short': 0.25,      # words of 1-20 characters
    'phrase_short': 0.15,    # 2-3 words, <= 25 characters
    'line_mid': 0.30,        # 40-120 characters
    'line_long': 0.15,       # 120-220 characters
    'symbol_focus': 0.10,    # symbol-focused
    'korean_focus': 0.05,    # Korean-focused (an overall 35-50% Korean ratio is enforced on top)
}

# Enforced share of Korean-containing samples across the whole sample set
KOREAN_MIN_RATIO = 0.35
KOREAN_MAX_RATIO = 0.50

# Augmentation settings
AUGMENTATION_PROB = 0.45  # intended range 30-60%, default 45%
AUGMENTATION_CONFIG = {
    'scale_range': (0.85, 1.15),
    'blur_sigma': (0.3, 0.8),
    'sharpen_factor': (0.9, 1.1),
    'jpeg_quality_range': (60, 95),
    'noise_std': 3.0,
    'rotation_range': (-1.0, 1.0),
}

# Length ranges (in characters) for line samples
LINE_MID_MIN = 40
LINE_MID_MAX = 120
LINE_LONG_MIN = 120
LINE_LONG_MAX = 220

# Symbols used for symbol-focused training (extended set).
# The single-character entries come first; has_symbol() treats any entry found
# as a substring as a match.
SYMBOLS = [
    '(', ')', '{', '}', '[', ']', '.', ':', ';', '=', '+', '-', '*', '/', '<', '>', 
    '|', '_', '&', '$', '@', '!', '?', '%', '^', '~', '`', '#', '\\', '"', "'", ',',
    '::', '...', '->', '=>', '&&', '||', '??', '?:', '===', '!==', '>>', '<<', '>>>',
    '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '<=', '>=', '==', '!=',
    '\\n', '\\t', '\\r', '\\"', "\\'", '\\\\',
]

# Korean comment/string templates (used as a fallback source of Korean samples)
KOREAN_TEMPLATES = [
    # Comments
    '# 한글 주석 예시',
    '// 한글 주석',
    '/* 한글 주석 */',
    '# TODO: 한글 작업',
    '# FIXME: 한글 버그 수정',
    # Strings
    '"한글 문자열"',
    "'한글 문자열'",
    '`한글 템플릿 ${변수}`',
    'f"한글 포맷 {value}"',
    # Variable / function names
    '한글변수 = 10',
    'def 한글함수():',
    'class 한글클래스:',
    # Mixed
    'print("한글 출력")',
    'console.log("한글 로그")',
    'return "한글 반환값"',
]

# Theme settings: each theme maps a role name to an RGB tuple.
# 'bg' is the image background; the other keys are looked up by get_token_color().
DARK_THEMES = [
    {
        'bg': (30, 30, 30),
        'text': (212, 212, 212),
        'comment': (106, 153, 85),
        'keyword': (86, 156, 214),
        'string': (206, 145, 120),
        'number': (181, 206, 168),
        'name': (156, 220, 254),
    },
    {
        'bg': (25, 25, 35),
        'text': (200, 200, 220),
        'comment': (100, 150, 100),
        'keyword': (80, 150, 210),
        'string': (200, 140, 115),
        'number': (175, 200, 165),
        'name': (150, 215, 250),
    },
    {
        'bg': (28, 28, 28),
        'text': (248, 248, 242),
        'comment': (117, 113, 94),
        'keyword': (249, 38, 114),
        'string': (230, 219, 116),
        'number': (174, 129, 255),
        'name': (102, 217, 239),
    },
]

LIGHT_THEMES = [
    {
        'bg': (255, 255, 255),
        'text': (30, 30, 30),
        'comment': (0, 128, 0),
        'keyword': (0, 0, 255),
        'string': (163, 21, 21),
        'number': (0, 128, 128),
        'name': (0, 0, 128),
    },
]

# ============================================================================
# 유틸리티 함수
# ============================================================================

def contains_hangul(text: str) -> bool:
    """Return True if *text* has at least one character in U+AC00..U+D7A3.

    Only that range (precomposed Hangul syllables) is checked; standalone
    jamo such as U+3131 are not counted.
    """
    for ch in text:
        if '\uac00' <= ch <= '\ud7a3':
            return True
    return False


def has_symbol(text: str) -> bool:
    """Return True if any entry of SYMBOLS occurs as a substring of *text*."""
    for symbol in SYMBOLS:
        if symbol in text:
            return True
    return False


def find_code_files(root: Path) -> List[Path]:
    """Collect every non-empty source file below *root*.

    A file is kept when its suffix is in CODE_EXTENSIONS, its name is not in
    EXCLUDE_FILES, and none of its path components *relative to root* is in
    EXCLUDE_DIRS.

    Args:
        root: Directory that is scanned recursively.

    Returns:
        Matching paths, ordered by extension and then by path so repeated
        runs over the same tree see the files in the same order.
    """
    code_files = []
    # Sets of strings iterate in a hash-dependent order; sort for stable output.
    for ext in sorted(CODE_EXTENSIONS):
        for file_path in sorted(root.rglob(f'*{ext}')):
            # Only look at components below root: a workspace that itself lives
            # under e.g. ".../build/..." must not be excluded wholesale.
            relative_parts = file_path.relative_to(root).parts
            if not EXCLUDE_DIRS.isdisjoint(relative_parts):
                continue
            if file_path.name in EXCLUDE_FILES:
                continue
            try:
                if file_path.is_file() and file_path.stat().st_size > 0:
                    code_files.append(file_path)
            except OSError:
                # Unreadable entry (permissions, vanished file): skip it.
                continue
    return code_files


def extract_code_lines(file_path: Path) -> List[str]:
    """Read *file_path* and return its usable lines with indentation kept.

    Decoding strategy:
      1. strict UTF-8 (a BOM is tolerated);
      2. if that fails and chardet is available, the encoding chardet detects;
      3. otherwise UTF-8 with undecodable bytes replaced.

    Lines that are blank, have fewer than 2 non-whitespace-trimmed characters,
    or are longer than 250 characters are dropped.

    Args:
        file_path: File to read.

    Returns:
        The retained lines (leading whitespace preserved), or an empty list
        when the file cannot be read.
    """
    lines: List[str] = []
    try:
        rawdata = file_path.read_bytes()
    except OSError:
        return lines

    try:
        # Strict decode so that non-UTF-8 files actually reach the fallback;
        # decoding with errors='replace' never raises and would hide them.
        content = rawdata.decode('utf-8-sig')
    except UnicodeDecodeError:
        content = None
        if CHARDET_AVAILABLE:
            encoding = chardet.detect(rawdata).get('encoding')
            if encoding:
                try:
                    content = rawdata.decode(encoding, errors='replace')
                except LookupError:
                    # chardet reported a codec name Python does not know.
                    content = None
        if content is None:
            content = rawdata.decode('utf-8-sig', errors='replace')

    for line in content.split('\n'):
        line = line.rstrip('\n\r')
        stripped = line.strip()
        if len(stripped) < 2:
            # Covers both blank lines and single-character lines.
            continue
        if len(line) > 250:  # upper bound chosen so long-line samples are still included
            continue
        lines.append(line)

    return lines


def split_into_words_and_phrases(line: str) -> List[Tuple[str, str]]:
    """Split a line into short word and phrase segments.

    Leading whitespace is ignored (indentation is not part of the label).

    Args:
        line: One source line.

    Returns:
        A list of ``(bucket_name, text)`` tuples, in this order:
        every whitespace-separated word of at most 20 characters tagged
        ``'word_short'``, then every run of 2 consecutive words and then every
        run of 3 consecutive words (joined by single spaces) of at most 25
        characters, tagged ``'phrase_short'``.
    """
    # str.split() with no argument already discards leading whitespace.
    words = line.split()

    segments: List[Tuple[str, str]] = [
        ('word_short', word) for word in words if len(word) <= 20
    ]

    for span in (2, 3):
        for start in range(len(words) - span + 1):
            combo = ' '.join(words[start:start + span])
            if len(combo) <= 25:
                segments.append(('phrase_short', combo))

    return segments


def extract_line_samples(lines: List[str]) -> Tuple[List[str], List[str]]:
    """Sort left-stripped lines into mid-length and long samples.

    A line whose stripped length lies in [LINE_MID_MIN, LINE_MID_MAX] goes to
    the first list; otherwise, if it lies in [LINE_LONG_MIN, LINE_LONG_MAX],
    to the second. The mid range is tested first, so a length inside both
    ranges counts as mid. Everything else is dropped.
    """
    mid_samples: List[str] = []
    long_samples: List[str] = []

    for raw in lines:
        text = raw.lstrip()
        if text == '':
            continue
        size = len(text)
        if LINE_MID_MIN <= size <= LINE_MID_MAX:
            mid_samples.append(text)
            continue
        if LINE_LONG_MIN <= size <= LINE_LONG_MAX:
            long_samples.append(text)

    return mid_samples, long_samples


def generate_symbol_focused_segments() -> List[str]:
    """Build the fixed list of symbol-heavy training strings.

    The result is the concatenation, in order, of: the first 30 entries of
    SYMBOLS, a list of operator combinations, keyword/operator and
    digit/operator mixes, regex/path/URL-like patterns, and escape-sequence
    spellings. Duplicates are kept (the digit/operator mixes are emitted once
    per keyword), so they carry more weight when the list is sampled.
    """
    # Standalone symbols
    segments = list(SYMBOLS[:30])

    # Operator combinations
    segments += [
        '()', '{}', '[]', '==', '!=', '<=', '>=', '+=', '-=', '*=', '/=',
        '->', '=>', '::', '...', '/*', '*/', '//', '&&', '||', '??', '?:',
        '===', '!==', '>>', '<<', '>>>', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=', '\\n', '\\t', '\\"', "\\'", '\\\\',
    ]

    # Symbols mixed with identifiers / numbers / spaces
    keywords = ['if', 'for', 'while', 'def', 'class', 'import', 'from', 'return', 
                'const', 'let', 'var', 'function', 'async', 'await', 'export']
    operator_chars = ['(', '{', '[', '.', ':', '=', '+', '-', '*', '/', '<', '>', '|', '&']
    digits = ['0', '1', '2', '10', '100']

    for keyword in keywords:
        for op in operator_chars:
            # symbol + word, with and without a separating space
            segments += [op + keyword, keyword + op, op + ' ' + keyword, keyword + ' ' + op]
            # symbol + number (repeated for every keyword)
            for digit in digits:
                segments += [op + digit, digit + op]

    # Regex / path / URL shapes
    segments += [
        r'/^[a-z]+$/',
        r'/\d+/g',
        r'\.(jpg|png|gif)',
        r'https?://[^\s]+',
        r'C:\\Users\\[^\\]+',
        r'/[a-z]+/[a-z]+',
    ]

    # String escape spellings
    segments += [
        r'\\n', r'\\t', r'\\r', r'\\"', r"\\'", r'\\\\',
        r'\u0041', r'\uAC00', r'\n', r'\t',
    ]

    return segments


def generate_korean_synthetic_segments() -> List[str]:
    """Return synthetic Korean code snippets used as a fallback sample source.

    The list starts with every entry of KOREAN_TEMPLATES, followed by each
    declaration prefix combined with each Korean identifier in two forms:
    ``<prefix><name>()`` and ``<prefix><name> = value``.
    """
    declaration_prefixes = ['def ', 'class ', 'const ', 'let ', 'var ', 'function ']
    korean_names = ['한글함수', '한글클래스', '한글변수', '한글상수']

    # Template-based entries first
    segments = list(KOREAN_TEMPLATES)

    # Korean identifiers inserted into code-like lines
    segments += [
        prefix + name + tail
        for prefix in declaration_prefixes
        for name in korean_names
        for tail in ('()', ' = value')
    ]

    return segments


def load_font(font_size: int, require_korean: bool = False) -> Tuple[Optional[ImageFont.FreeTypeFont], bool]:
    """Load the first usable font from FONT_PRIORITY at *font_size*.

    For each font name, on Windows the .ttf/.otf files under C:/Windows/Fonts
    (with and without spaces in the name) are tried first; then the bare name
    is handed to ``ImageFont.truetype``. If nothing loads, Pillow's default
    font is returned.

    Args:
        font_size: Requested font size passed to ``ImageFont.truetype``.
        require_korean: Accepted for interface compatibility; this function
            does not consult it. Callers check the returned flag instead.

    Returns:
        ``(font, korean_capable)``. ``korean_capable`` is True only when the
        loaded font's name is one of the known Korean-capable entries (it is a
        name-based flag, not a glyph check). ``(None, False)`` if even the
        default font cannot be loaded.
    """
    korean_capable_names = frozenset(
        ['D2Coding', 'D2CodingBold', 'Noto Sans Mono CJK KR', 'Malgun Gothic']
    )

    for font_name in FONT_PRIORITY:
        candidates = []
        if sys.platform == 'win32':
            compact_name = font_name.replace(' ', '')
            windows_paths = [
                f"C:/Windows/Fonts/{font_name}.ttf",
                f"C:/Windows/Fonts/{font_name}.otf",
                f"C:/Windows/Fonts/{compact_name}.ttf",
                f"C:/Windows/Fonts/{compact_name}.otf",
            ]
            candidates.extend(path for path in windows_paths if os.path.exists(path))
        # Last resort for this name: let Pillow resolve it on its own.
        candidates.append(font_name)

        for candidate in candidates:
            try:
                font = ImageFont.truetype(candidate, font_size)
            except Exception:
                # Best effort: any load failure just moves on to the next
                # candidate. Unlike a bare except, Ctrl+C still interrupts.
                continue
            return font, font_name in korean_capable_names

    try:
        return ImageFont.load_default(), False
    except Exception:
        return None, False


def tokenize_with_pygments(code: str, file_ext: str) -> List[Tuple[str, str]]:
    """Tokenize *code* with the Pygments lexer mapped to *file_ext*.

    Args:
        code: Text to tokenize (a single rendered segment).
        file_ext: File extension used to pick a lexer from LEXER_MAP; unknown
            extensions use the plain ``'text'`` lexer.

    Returns:
        ``(token_type_string, text)`` pairs. When Pygments is unavailable or
        lexing fails, a single ``('Text', code)`` pair is returned.
    """
    if not PYGMENTS_AVAILABLE:
        return [('Text', code)]
    try:
        lexer_name = LEXER_MAP.get(file_ext, 'text')
        # ensurenl defaults to True, which appends "\n" to the input. That
        # newline would come back as a token and be measured/drawn as
        # multiline text by the renderer, so turn it off.
        lexer = get_lexer_by_name(lexer_name, ensurenl=False)
        return [(str(token_type), text) for token_type, text in lexer.get_tokens(code)]
    except Exception:
        # Best effort: fall back to one untyped token rather than failing.
        return [('Text', code)]


def get_token_color(token_type: str, theme: dict) -> tuple:
    """Pick the theme color for a token type.

    The string form of *token_type* is tested against substring rules in a
    fixed order (comment, keyword/builtin, string, number, name); the first
    rule that matches selects the theme key. A missing theme key, or no
    matching rule, yields ``theme['text']``.
    """
    kind = str(token_type)
    default_color = theme['text']

    # Order matters: 'Name.Builtin' must be seen before the generic 'Name'.
    rules = (
        (('Comment',), 'comment'),
        (('Keyword', 'Name.Builtin'), 'keyword'),
        (('String',), 'string'),
        (('Number',), 'number'),
        (('Name',), 'name'),
    )
    for markers, theme_key in rules:
        if any(marker in kind for marker in markers):
            return theme.get(theme_key, default_color)
    return default_color


def render_text_with_syntax(
    text: str,
    file_ext: str,
    font: ImageFont.FreeTypeFont,
    theme: dict,
    target_height: int = 48,
    max_width: int = 640,
    padding: int = 10,
    min_font_size: int = 8
) -> Image.Image:
    """Render *text* as a single-line, syntax-colored image.

    The text is tokenized, measured token by token, and, if it does not fit
    inside ``max_width`` x ``target_height`` minus padding, a smaller font is
    requested from load_font() (never below *min_font_size*) and the text is
    measured again. Text that is still too wide widens the image instead of
    being clipped.

    Args:
        text: Text to draw.
        file_ext: Extension used to choose the lexer.
        font: Initial font.
        theme: Color mapping; ``'bg'`` is the background, the rest is read by
            get_token_color().
        target_height: Height of the returned image.
        max_width: Preferred maximum width.
        padding: Margin used for the fit check and as the left start offset.
        min_font_size: Lower bound when shrinking the font.

    Returns:
        An RGB image of height *target_height*.
    """
    tokens = tokenize_with_pygments(text, file_ext)
    background = theme['bg']

    # Scratch surface that exists only so text can be measured.
    scratch = Image.new("RGB", (3000, 200), background)

    def measure(active_font):
        """Return (sum of token widths, tallest token height) for active_font."""
        probe = ImageDraw.Draw(scratch)
        width_sum = 0
        tallest = 0
        for _, piece in tokens:
            left, top, right, bottom = probe.textbbox((0, 0), piece, font=active_font)
            width_sum += right - left
            tallest = max(tallest, bottom - top)
        return width_sum, tallest

    total_width, max_token_height = measure(font)

    usable_width = max_width - (padding * 2)
    usable_height = target_height - (padding * 2)

    # Shrink the font when the text overflows in either direction.
    if total_width > usable_width or max_token_height > usable_height:
        width_factor = usable_width / total_width if total_width > 0 else 1.0
        height_factor = usable_height / max_token_height if max_token_height > 0 else 1.0
        shrink = min(width_factor, height_factor, 1.0) * 0.90

        try:
            current_size = getattr(font, 'size', 16)
        except:
            # Kept as broad as the historical fallback: any failure means 16.
            current_size = 16
        wanted_size = max(min_font_size, int(current_size * shrink))
        replacement, _ = load_font(wanted_size, require_korean=False)
        if replacement is not None:
            font = replacement

        # Measure again with whichever font is now active.
        total_width, max_token_height = measure(font)

    # Final width: grow past max_width only when the text demands it.
    natural_width = int(total_width + padding * 2 + 4)
    if total_width + padding * 2 > max_width:
        final_width = natural_width
    else:
        final_width = min(max_width, natural_width)

    canvas = Image.new("RGB", (final_width, target_height), background)
    pen = ImageDraw.Draw(canvas)

    cursor_x = padding
    top_y = (target_height - max_token_height) // 2

    for token_type, piece in tokens:
        pen.text((cursor_x, top_y), piece, fill=get_token_color(token_type, theme), font=font)
        # Advance to the right edge of what was just drawn.
        cursor_x = pen.textbbox((cursor_x, top_y), piece, font=font)[2]

    return canvas


def apply_augmentation(img: Image.Image, config: Dict) -> Image.Image:
    """Apply random screenshot-like degradations to *img*.

    Each step runs independently with its own probability and operates on the
    output of the previous steps:
      - down/up (or up/down) resampling back to the original size (70%)
      - Gaussian blur (40%)
      - sharpness change (30%)
      - additive Gaussian pixel noise (30%)
      - small rotation, filled with the top-left pixel color (20%)

    Args:
        img: Source image.
        config: Mapping with ``scale_range``, ``blur_sigma``,
            ``sharpen_factor``, ``noise_std`` and ``rotation_range``.

    Returns:
        The augmented image (same size as the input).
    """
    # Scale change + resampling
    if random.random() < 0.7:
        scale = random.uniform(*config['scale_range'])
        pixels = np.array(img)
        h, w = pixels.shape[:2]
        # Guard against a zero-sized intermediate for tiny images.
        new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
        pixels = cv2.resize(pixels, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        pixels = cv2.resize(pixels, (w, h), interpolation=cv2.INTER_LINEAR)
        img = Image.fromarray(pixels)

    # Mild blur
    if random.random() < 0.4:
        sigma = random.uniform(*config['blur_sigma'])
        img = img.filter(ImageFilter.GaussianBlur(radius=sigma))

    # Sharpness variation
    if random.random() < 0.3:
        factor = random.uniform(*config['sharpen_factor'])
        img = ImageEnhance.Sharpness(img).enhance(factor)

    # Gaussian noise
    if random.random() < 0.3:
        # Re-read the pixels from the *current* image: reusing an array taken
        # before the blur/sharpen steps would silently undo them.
        pixels = np.array(img).astype(np.float32)
        noise = np.random.normal(0, config['noise_std'], pixels.shape).astype(np.float32)
        img = Image.fromarray(np.clip(pixels + noise, 0, 255).astype(np.uint8))

    # Very slight rotation
    if random.random() < 0.2:
        angle = random.uniform(*config['rotation_range'])
        img = img.rotate(angle, resample=Image.BICUBIC, expand=False, fillcolor=img.getpixel((0, 0)))

    return img


def enhance_image_quality(img: Image.Image) -> Image.Image:
    """Return *img* with contrast scaled by 1.2 and then sharpness by 1.3."""
    contrasted = ImageEnhance.Contrast(img).enhance(1.2)
    return ImageEnhance.Sharpness(contrasted).enhance(1.3)


def is_valid_segment(segment: str, max_length: int = 220) -> bool:
    """Decide whether *segment* is usable as a sample.

    Rejected: segments that are empty after trimming, segments whose raw
    (untrimmed) length exceeds *max_length*, and segments that consist only
    of digits once trimmed.
    """
    core = segment.strip()
    if not core or len(segment) > max_length:
        return False
    # Pure numbers are not useful samples.
    return re.fullmatch(r'\d+', core) is None


def collect_segments_by_bucket() -> Dict[str, List[str]]:
    """Scan WORKSPACE_ROOT and sort extracted text into the six buckets.

    Word/phrase segments that pass validation are routed to 'korean_focus'
    when they contain Hangul, else to 'symbol_focus' when they contain a
    symbol, else to the bucket reported by the splitter. Line samples are
    taken from all lines and then again from the Hangul-containing lines, so
    Korean lines enter 'line_mid'/'line_long' twice. Generated symbol
    segments are always added; synthetic Korean segments are added only while
    'korean_focus' holds fewer than 1000 entries.

    Returns:
        Mapping of bucket name to its list of segments. Progress and the
        per-bucket totals are printed along the way.
    """
    rule = "=" * 70
    print(rule)
    print("코드 파일 스캔 및 버킷별 세그먼트 추출 중...")
    print(rule)

    code_files = find_code_files(WORKSPACE_ROOT)
    print(f"발견된 코드 파일: {len(code_files):,}개")

    bucket_names = (
        'word_short', 'phrase_short', 'line_mid',
        'line_long', 'symbol_focus', 'korean_focus',
    )
    buckets: Dict[str, List[str]] = {name: [] for name in bucket_names}

    all_lines: List[str] = []
    korean_lines: List[str] = []

    for file_path in tqdm(code_files, desc="파일 읽기"):
        file_lines = extract_code_lines(file_path)
        all_lines += file_lines

        # Remember Hangul-containing lines separately
        korean_lines += [entry for entry in file_lines if contains_hangul(entry)]

        # Word / phrase level segments
        for entry in file_lines:
            for bucket_type, segment in split_into_words_and_phrases(entry):
                if not is_valid_segment(segment, max_length=25):
                    continue
                if contains_hangul(segment):
                    destination = 'korean_focus'
                elif has_symbol(segment):
                    destination = 'symbol_focus'
                else:
                    destination = bucket_type
                buckets[destination].append(segment)

    # Line samples from every line
    mid_from_all, long_from_all = extract_line_samples(all_lines)
    buckets['line_mid'] += mid_from_all
    buckets['line_long'] += long_from_all

    # Line samples from Korean lines (added on top of the ones above)
    korean_mid, korean_long = extract_line_samples(korean_lines)
    buckets['line_mid'] += korean_mid
    buckets['line_long'] += korean_long
    buckets['korean_focus'] += korean_mid[:100]  # a slice of the Korean line samples
    buckets['korean_focus'] += korean_long[:50]

    # Generated symbol-focused segments
    buckets['symbol_focus'] += generate_symbol_focused_segments()

    # Synthetic Korean segments (fallback)
    if len(buckets['korean_focus']) < 1000:
        buckets['korean_focus'] += generate_korean_synthetic_segments()

    print(f"\n추출된 세그먼트 (버킷별):")
    for bucket_name, bucket_items in buckets.items():
        print(f"  {bucket_name}: {len(bucket_items):,}개")

    return buckets


def sample_with_korean_ratio(
    buckets: Dict[str, List[str]],
    total_count: int,
    korean_min_ratio: float,
    korean_max_ratio: float,
    bucket_ratios: Dict[str, float]
) -> List[str]:
    """Sample segments per bucket, then steer the Korean share into range.

    Steps:
      1. For each bucket, draw ``int(total_count * ratio)`` segments. When a
         bucket is too small, its contents are used in full and padded with
         whitespace variants of randomly chosen entries.
      2. If fewer than ``korean_min_ratio`` of *total_count* contain Hangul,
         unused 'korean_focus' entries (then synthetic ones) are added and
         non-Korean entries are dropped at random to make room, so the final
         truncation cannot dilute the share again.
      3. If more than ``korean_max_ratio`` contain Hangul, the surplus Korean
         entries are removed.

    Args:
        buckets: Bucket name -> candidate segments.
        total_count: Number of segments wanted.
        korean_min_ratio: Minimum share of Hangul-containing segments.
        korean_max_ratio: Maximum share of Hangul-containing segments.
        bucket_ratios: Bucket name -> share of *total_count*.

    Returns:
        A shuffled list of at most *total_count* segments. It can be shorter
        when the buckets cannot supply enough material.
    """
    all_segments: List[str] = []

    # Per-bucket sampling
    for bucket_name, ratio in bucket_ratios.items():
        target_count = int(total_count * ratio)
        available = buckets.get(bucket_name, [])
        if len(available) >= target_count:
            sampled = random.sample(available, target_count)
        else:
            sampled = list(available)
            # Not enough material: pad with simple whitespace variants.
            while len(sampled) < target_count and available:
                base = random.choice(available)
                variants = [
                    base,
                    base.replace('  ', ' '),
                    base.replace(' ', '  '),
                ]
                sampled.extend(variants[:target_count - len(sampled)])
        all_segments.extend(sampled)

    korean_min_count = int(total_count * korean_min_ratio)
    korean_max_count = int(total_count * korean_max_ratio)

    korean_segments = [s for s in all_segments if contains_hangul(s)]
    non_korean_segments = [s for s in all_segments if not contains_hangul(s)]
    korean_count = len(korean_segments)

    if korean_count < korean_min_count:
        korean_needed = korean_min_count - korean_count
        # Set membership keeps this linear; a list lookup per candidate is
        # quadratic for large buckets.
        already_chosen = set(all_segments)
        korean_available = [
            s for s in buckets.get('korean_focus', []) if s not in already_chosen
        ]
        if len(korean_available) >= korean_needed:
            extra = random.sample(korean_available, korean_needed)
        else:
            synthetic = generate_korean_synthetic_segments()
            extra = korean_available + synthetic[:korean_needed - len(korean_available)]
        korean_segments.extend(extra)

        # Make room for the additions instead of letting the final
        # shuffle-and-truncate throw part of them away again.
        room = max(0, total_count - len(korean_segments))
        if len(non_korean_segments) > room:
            non_korean_segments = random.sample(non_korean_segments, room)

    elif korean_count > korean_max_count:
        # Too much Korean: drop the surplus.
        excess = korean_count - korean_max_count
        korean_segments = korean_segments[:-excess] if excess < len(korean_segments) else []

    all_segments = korean_segments + non_korean_segments
    random.shuffle(all_segments)
    return all_segments[:total_count]


def _bucket_for_length(length: int) -> str:
    """Map a text length to the bucket name used for stats and render sizing."""
    if length <= 20:
        return 'word_short'
    if length <= 25:
        return 'phrase_short'
    if length <= 120:
        return 'line_mid'
    return 'line_long'


def _render_split(
    segments: List[str],
    split_name: str,
    images_dir: Path,
    progress_desc: str,
    font,
    augmentation_prob: float,
    stats: dict,
) -> List[str]:
    """Render every segment of one split to a JPEG and return its label lines.

    *stats* is updated in place (bucket, Korean, symbol and augmentation
    counters). Each label line has the form ``<split>/<image>\\t<text>\\n``.
    """
    labels: List[str] = []
    quality_low, quality_high = AUGMENTATION_CONFIG['jpeg_quality_range']

    for index, segment in enumerate(tqdm(segments, desc=progress_desc)):
        # The label is the normalized text; the same text is rendered so the
        # pixels and the label always agree (e.g. tabs become spaces in both).
        label_text = segment.replace('\t', ' ').replace('\n', ' ').strip()

        bucket_type = _bucket_for_length(len(segment))
        stats['bucket_counts'][bucket_type] = stats['bucket_counts'].get(bucket_type, 0) + 1

        if contains_hangul(segment):
            stats['korean_count'] += 1
        if has_symbol(segment):
            stats['symbol_count'] += 1

        # Theme: dark 80% of the time
        if random.random() < 0.8:
            theme = random.choice(DARK_THEMES)
        else:
            theme = random.choice(LIGHT_THEMES)

        # Random file extension decides the lexer
        file_ext = random.choice(['.py', '.js', '.ts', '.html', '.css'])

        # Longer buckets get a wider canvas
        if bucket_type == 'line_long':
            max_img_width, min_font_size = 1600, 8
        elif bucket_type == 'line_mid':
            max_img_width, min_font_size = 960, 10
        else:
            max_img_width, min_font_size = 640, 10

        img = render_text_with_syntax(
            label_text,
            file_ext,
            font,
            theme,
            target_height=48,
            max_width=max_img_width,
            padding=10,
            min_font_size=min_font_size
        )

        should_augment = random.random() < augmentation_prob
        if should_augment:
            img = apply_augmentation(img, AUGMENTATION_CONFIG)
            stats['augmented_count'] += 1
        else:
            img = enhance_image_quality(img)

        # Augmented samples are saved at a random, lower JPEG quality
        img_name = f"word_{index + 1:06d}.jpg"
        jpeg_quality = random.randint(quality_low, quality_high) if should_augment else 95
        img.save(images_dir / img_name, "JPEG", quality=jpeg_quality, optimize=True)

        labels.append(f"{split_name}/{img_name}\t{label_text}\n")

    return labels


def generate_dataset(
    target_train: int,
    target_val: int,
    require_korean_font: bool,
    korean_min_ratio: float,
    korean_max_ratio: float,
    augmentation_prob: float,
    bucket_ratios: Dict[str, float]
):
    """Generate the train and validation image sets plus their label files.

    Segments are collected and sampled, a font is loaded, and each segment is
    rendered to ``REC_TRAIN_IMAGES_DIR`` / ``REC_VAL_IMAGES_DIR``. Labels go
    to ``REC_GT_TRAIN_FILE`` / ``REC_GT_VAL_FILE``. If sampling yields fewer
    segments than requested, the splits are simply shorter. The function
    returns early (after printing an error) when no font can be loaded, or
    when a Korean font is required but not found.

    Args:
        target_train: Number of training images wanted.
        target_val: Number of validation images wanted.
        require_korean_font: Abort when no Korean-capable font is available.
        korean_min_ratio: Minimum share of Hangul-containing samples.
        korean_max_ratio: Maximum share of Hangul-containing samples.
        augmentation_prob: Probability of augmenting a sample.
        bucket_ratios: Bucket name -> share of the total sample count.
    """
    print("\n" + "=" * 70)
    print("PaddleOCR REC 데이터셋 생성 (고급 버전)")
    print(f"목표: Train {target_train:,}개 + Val {target_val:,}개 = 총 {target_train + target_val:,}개")
    print("=" * 70)

    # Output directories
    REC_TRAIN_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    REC_VAL_IMAGES_DIR.mkdir(parents=True, exist_ok=True)

    # Collect and sample segments
    buckets = collect_segments_by_bucket()
    total_count = target_train + target_val
    all_segments = sample_with_korean_ratio(
        buckets, total_count, korean_min_ratio, korean_max_ratio, bucket_ratios
    )

    # Font
    font, has_korean = load_font(16, require_korean=require_korean_font)
    if font is None:
        print("[ERROR] 폰트를 로드할 수 없습니다.")
        return

    if require_korean_font and not has_korean:
        print("[ERROR] 한글 지원 폰트를 찾을 수 없습니다!")
        print("[INFO] 다음 폰트 중 하나를 설치해주세요:")
        print("  - D2Coding: https://github.com/naver/d2codingfont")
        print("  - Noto Sans Mono CJK KR")
        return

    if has_korean:
        print(f"[INFO] 한글 지원 폰트 로드 완료")
    else:
        print("[WARN] 한글 지원 폰트를 찾을 수 없습니다. 영문 폰트로 진행합니다.")

    # Counters shared by both splits
    stats = {
        'korean_count': 0,
        'symbol_count': 0,
        'augmented_count': 0,
        'bucket_counts': {name: 0 for name in bucket_ratios.keys()},
    }

    # Train split. Slicing (rather than indexing up to target_train) keeps
    # this safe when fewer segments were sampled than requested.
    print(f"\n[1/2] Train 이미지 생성 중... ({target_train:,}개)")
    random.seed(42)
    train_labels = _render_split(
        all_segments[:target_train], "train", REC_TRAIN_IMAGES_DIR,
        "Train 이미지 생성", font, augmentation_prob, stats,
    )

    # Validation split
    print(f"\n[2/2] Validation 이미지 생성 중... ({target_val:,}개)")
    random.seed(123)
    val_labels = _render_split(
        all_segments[target_train:target_train + target_val], "val", REC_VAL_IMAGES_DIR,
        "Val 이미지 생성", font, augmentation_prob, stats,
    )

    # Label files
    print("\n라벨 파일 저장 중...")
    with open(REC_GT_TRAIN_FILE, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(train_labels)

    with open(REC_GT_VAL_FILE, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(val_labels)

    # Summary
    total_generated = len(train_labels) + len(val_labels)

    def percent(count: int) -> float:
        """Share of *count* in the generated total; 0.0 when nothing was generated."""
        return count / total_generated * 100 if total_generated else 0.0

    print("\n" + "=" * 70)
    print("[OK] 데이터셋 생성 완료!")
    print("=" * 70)
    print(f"\n📊 생성 결과:")
    print(f"  Train 이미지: {len(train_labels):,}개")
    print(f"  Val 이미지: {len(val_labels):,}개")
    print(f"  총 이미지: {total_generated:,}개")

    print(f"\n📈 통계:")
    print(f"  한글 포함: {stats['korean_count']:,}개 ({percent(stats['korean_count']):.1f}%)")
    print(f"  기호 포함: {stats['symbol_count']:,}개 ({percent(stats['symbol_count']):.1f}%)")
    print(f"  증강 적용: {stats['augmented_count']:,}개 ({percent(stats['augmented_count']):.1f}%)")
    print(f"\n  버킷별 분포:")
    for bucket_name, count in stats['bucket_counts'].items():
        if count > 0:
            print(f"    {bucket_name}: {count:,}개 ({percent(count):.1f}%)")

    print(f"\n📁 저장 위치:")
    print(f"  Train 이미지: {REC_TRAIN_IMAGES_DIR}")
    print(f"  Val 이미지: {REC_VAL_IMAGES_DIR}")
    print(f"  Train 라벨: {REC_GT_TRAIN_FILE}")
    print(f"  Val 라벨: {REC_GT_VAL_FILE}")

    print(f"\n✨ 적용된 기능:")
    print(f"  - 버킷 기반 샘플링 (6개 버킷)")
    print(f"  - 한글 비율 강제: {korean_min_ratio*100:.0f}~{korean_max_ratio*100:.0f}%")
    print(f"  - 증강 적용: {augmentation_prob*100:.0f}%")
    print(f"  - 롱 라인 지원: 120~220자 (최대 1600px)")
    print(f"  - 기호 커버리지 강화")
    print(f"  - 한글 지원 폰트: {'사용' if has_korean else '미사용'}")


def main():
    """CLI entry point: parse options, validate bucket ratios, build the dataset.

    Parses the command-line options, assembles the per-bucket sampling ratios
    (normalizing them when they do not sum to 1.0) and forwards everything to
    ``generate_dataset``.

    Exits through ``parser.error`` (status 2) when a bucket ratio is negative
    or when the ratios do not add up to a positive number, because such input
    cannot be normalized.
    """
    def _pct(ratio: float) -> str:
        # argparse %-formats every help string, so a literal '%' has to be
        # written as '%%'; otherwise `--help` raises ValueError instead of
        # printing the usage text.
        return f'{ratio:.0%}'.replace('%', '%%')

    parser = argparse.ArgumentParser(
        description='PaddleOCR REC 데이터셋 생성 스크립트 (고급 버전)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
예시:
  # 기본 설정 (Train 25,000, Val 5,000)
  python 24_generate_code_rec_dataset_from_source.py
  
  # 대규모 생성 (Train 100,000, Val 20,000)
  python 24_generate_code_rec_dataset_from_source.py --train 100000 --val 20000
  
  # 한글 폰트 필수 + 한글 비율 50%
  python 24_generate_code_rec_dataset_from_source.py --require-korean --korean-min 0.5
  
  # 증강 비율 60%
  python 24_generate_code_rec_dataset_from_source.py --augment-prob 0.6
        """
    )

    # Dataset sizes.
    parser.add_argument('--train', type=int, default=DEFAULT_TARGET_TRAIN_COUNT,
                        help=f'Train 데이터셋 크기 (기본: {DEFAULT_TARGET_TRAIN_COUNT:,})')
    parser.add_argument('--val', type=int, default=DEFAULT_TARGET_VAL_COUNT,
                        help=f'Val 데이터셋 크기 (기본: {DEFAULT_TARGET_VAL_COUNT:,})')

    # Korean font / Korean sample ratio / augmentation options.
    parser.add_argument('--require-korean', action='store_true',
                        help='한글 지원 폰트 필수 (없으면 종료)')
    parser.add_argument('--korean-min', type=float, default=KOREAN_MIN_RATIO,
                        help=f'한글 최소 비율 (기본: {_pct(KOREAN_MIN_RATIO)})')
    parser.add_argument('--korean-max', type=float, default=KOREAN_MAX_RATIO,
                        help=f'한글 최대 비율 (기본: {_pct(KOREAN_MAX_RATIO)})')
    parser.add_argument('--augment-prob', type=float, default=AUGMENTATION_PROB,
                        help=f'증강 적용 확률 (기본: {_pct(AUGMENTATION_PROB)})')

    # Bucket ratios (advanced options): one `--bucket-<name>` flag per bucket,
    # registered in a fixed order so the help output stays stable.
    bucket_names = (
        'word_short',
        'phrase_short',
        'line_mid',
        'line_long',
        'symbol_focus',
        'korean_focus',
    )
    for bucket_name in bucket_names:
        parser.add_argument(
            f'--bucket-{bucket_name.replace("_", "-")}',
            type=float,
            default=BUCKET_RATIOS[bucket_name],
            help=f'{bucket_name} 버킷 비율 (기본: {_pct(BUCKET_RATIOS[bucket_name])})',
        )

    args = parser.parse_args()

    # Collect the parsed per-bucket ratios (argparse maps '-' to '_' in dest).
    bucket_ratios = {
        bucket_name: getattr(args, f'bucket_{bucket_name}')
        for bucket_name in bucket_names
    }

    # Reject input that cannot be turned into a sampling distribution before
    # attempting to normalize it (a zero total would divide by zero below).
    if any(ratio < 0 for ratio in bucket_ratios.values()):
        parser.error('버킷 비율은 0 이상이어야 합니다.')
    total_ratio = sum(bucket_ratios.values())
    if total_ratio <= 0:
        parser.error('버킷 비율 합계는 0보다 커야 합니다.')

    # Normalize when the ratios do not sum to 1.0 (within a 0.01 tolerance).
    if abs(total_ratio - 1.0) > 0.01:
        print(f"[WARN] 버킷 비율 합계가 1.0이 아닙니다 ({total_ratio:.2f}). 정규화합니다.")
        bucket_ratios = {k: v / total_ratio for k, v in bucket_ratios.items()}

    # Generate the dataset with the resolved settings.
    generate_dataset(
        target_train=args.train,
        target_val=args.val,
        require_korean_font=args.require_korean,
        korean_min_ratio=args.korean_min,
        korean_max_ratio=args.korean_max,
        augmentation_prob=args.augment_prob,
        bucket_ratios=bucket_ratios
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

'3. 자습 & 메모(실전, 실습, 프로젝트) > 3-2 메모(실전, 프로젝트)' 카테고리의 다른 글

[MEMO] 실사용 모드 (0)	2026.02.08
[Memo] PaddleOCR v5 REC 파이프라인 재구축 회고 (0)	2026.01.04
[PaddleOCR] 학습용 REC 데이터셋 생성 스크립트 (0)	2026.01.02
[MEMO] PaddleOCR 코드 문법 인식 모델 학습 (0)	2026.01.01
[MEMO] 학습 데이터셋 정제 작업 메모 (0)	2025.12.31

코드 전체 예시

'3. 자습 & 메모(실전, 실습, 프로젝트) > 3-2 메모(실전, 프로젝트)' 카테고리의 다른 글

티스토리툴바