# allycat / 2_process_files.py
import hashlib
import logging
import os
import shutil
import sys
from pathlib import Path

import html2text
from docling.document_converter import DocumentConverter

from my_config import MY_CONFIG
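
# MY_CONFIG is expected to expose at least the two directory settings used in
# main() below. A minimal my_config.py sketch (assumed; the directory values
# here are hypothetical, not the project's actual configuration):
#
#   from types import SimpleNamespace
#   MY_CONFIG = SimpleNamespace(
#       CRAWL_DIR="crawl_data",          # where the crawler saved raw files
#       PROCESSED_DATA_DIR="processed",  # where markdown output goes
#   )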
# The root logger stays at WARNING to keep third-party libraries (e.g. docling)
# quiet; this module's logger is opted in to INFO for progress messages.
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def cleanup_duplicate_markdown_files(processed_dir):
    """
    Remove duplicate markdown files based on content hash.
    Keeps the first file (in sorted filename order) for each unique content.
    """
    processed_path = Path(processed_dir)
    # Sort so that "first file kept" is deterministic across runs.
    md_files = sorted(processed_path.glob('*.md'))
    if not md_files:
        logger.info("No markdown files found for deduplication")
        return 0

    content_hashes = {}
    duplicates_removed = 0
    for md_file in md_files:
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # MD5 is sufficient here: we need a fast content fingerprint,
            # not cryptographic strength.
            content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
            if content_hash in content_hashes:
                os.remove(md_file)
                duplicates_removed += 1
                logger.info(f"Removed duplicate: {md_file} (same content as {content_hashes[content_hash]})")
            else:
                content_hashes[content_hash] = md_file
        except Exception as e:
            logger.warning(f"Error processing {md_file} for deduplication: {e}")

    logger.info(f"βœ… Deduplication complete. Removed {duplicates_removed} duplicate files")
    return duplicates_removed
## --- end of cleanup_duplicate_markdown_files ---
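
# Standalone usage sketch (the directory name is hypothetical):
#
#   removed = cleanup_duplicate_markdown_files("processed")
#   print(f"{removed} duplicate file(s) removed")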

def process_files(crawl_dir, processed_dir):
    """
    Process all files in the crawl directory and convert them to markdown.
    Uses html2text for HTML/HTM files and docling for PDFs and other documents.

    Args:
        crawl_dir (str): Directory containing files to process
        processed_dir (str): Directory to save processed markdown files
    """
    input_path = Path(crawl_dir)
    # Only regular files: passing a sub-directory to docling would fail.
    input_files = [f for f in input_path.glob('*') if f.is_file()]
    logger.info(f"Found {len(input_files)} files to process in {input_path}")

    # Start from a clean output directory.
    shutil.rmtree(processed_dir, ignore_errors=True)
    os.makedirs(processed_dir, exist_ok=True)
    logger.info(f"βœ… Cleared processed data directory: {processed_dir}")

    # Initialize converters. DocumentConverter's format_options expects
    # docling InputFormat keys ("preserve_links" is not a valid option),
    # so the defaults are used; link handling for HTML is done by html2text.
    docling_converter = DocumentConverter()
    html_converter = html2text.HTML2Text()
    html_converter.ignore_links = False   # keep hyperlinks in the markdown
    html_converter.ignore_images = False  # keep image references
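    # Optional: html2text hard-wraps output at 78 columns by default;
    # setting body_width = 0 disables wrapping if long lines are preferred.
    # html_converter.body_width = 0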

    files_processed = 0
    errors = 0
    file_type_stats = {}

    for input_file in input_files:
        file_ext = input_file.suffix.lower()
        markdown_content = None
        try:
            # Process HTML/HTM files with html2text
            if file_ext in ['.html', '.htm']:
                with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
                    html_content = f.read()
                markdown_content = html_converter.handle(html_content)
                logger.debug(f"Converted HTML '{input_file}' with html2text")
            # Process TXT files directly
            elif file_ext == '.txt':
                with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
                    markdown_content = f.read()
                logger.debug(f"Processed TXT '{input_file}' directly")
            # Process PDF and other documents with docling
            else:
                result = docling_converter.convert(input_file)
                markdown_content = result.document.export_to_markdown()
                logger.debug(f"Converted '{input_file}' with docling")

            # Save markdown file
            if markdown_content:
                md_file_name = os.path.join(processed_dir, f"{input_file.stem}.md")
                with open(md_file_name, "w", encoding="utf-8") as md_file:
                    md_file.write(markdown_content)
                files_processed += 1
                file_type_stats[file_ext] = file_type_stats.get(file_ext, 0) + 1
        except Exception as e:
            errors += 1
            logger.warning(f"Error processing {input_file}: {e}")

    logger.info(f"βœ… Processed {files_processed} files. Errors: {errors}")

    # Print file type statistics in compact dictionary format
    if file_type_stats:
        logger.info(f"πŸ“Š File type statistics: {dict(sorted(file_type_stats.items()))}")

    return files_processed, errors, file_type_stats
## --- end of process_files ---
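
# Illustrative return shape from process_files (all values made up):
#
#   files_processed, errors, file_type_stats = process_files("crawl_data", "processed")
#   # -> (42, 1, {'.html': 30, '.pdf': 10, '.txt': 2})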

def main():
    """
    Main function to run the file processing pipeline.
    """
    logger.info("πŸš€ Starting file processing pipeline")
    try:
        files_processed, errors, file_type_stats = process_files(MY_CONFIG.CRAWL_DIR, MY_CONFIG.PROCESSED_DATA_DIR)
        duplicates_removed = cleanup_duplicate_markdown_files(MY_CONFIG.PROCESSED_DATA_DIR)
        logger.info(f"βœ… Final summary: {files_processed} files processed, {errors} errors, {duplicates_removed} duplicates removed")
        logger.info("βœ… File processing pipeline completed successfully")
        return 0
    except Exception as e:
        logger.error(f"❌ File processing pipeline failed: {e}")
        return 1

if __name__ == "__main__":
    sys.exit(main())