This repository has been archived by the owner on Jun 14, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: remove_non_lyrical.py
12 lines (10 loc) · 6.67 KB
/
remove_non_lyrical.py
1
2
3
4
5
6
7
8
9
10
11
12
import re
# Matches a single character in U+0900..U+FFFF, i.e. from the start of the
# Devanagari block to the end of the Basic Multilingual Plane. The original
# pattern listed every 0x80-wide sub-range individually; those sub-ranges are
# contiguous, so this single range is equivalent.
_LYRICAL_LINE_START = re.compile(r'[\u0900-\uFFFF]')


def remove_non_lyrical_lines(
    input_file: str,
    output_file: str,
    encoding: str = "utf-8",
) -> None:
    """Copy only the lines whose first character lies in U+0900..U+FFFF.

    Every line of ``input_file`` is inspected; a line is written to
    ``output_file`` unchanged when its very first character falls inside
    the range U+0900..U+FFFF. All other lines are dropped, including empty
    lines and lines that begin with ASCII/Latin text, digits, whitespace,
    or a character above U+FFFF.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to create or overwrite.
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If ``input_file`` is not valid in ``encoding``.
    """
    with open(input_file, 'r', encoding=encoding) as source, \
            open(output_file, 'w', encoding=encoding) as sink:
        # ``match`` anchors at the start of the line, so only the first
        # character decides whether the line is kept.
        sink.writelines(
            line for line in source if _LYRICAL_LINE_START.match(line)
        )
if __name__ == "__main__":
    # Script entry point: filter the cleaned corpus file into the final
    # output file, both resolved relative to the current working directory.
    remove_non_lyrical_lines(
        'cleaned_spb_texts.txt',
        'final_cleaned_spb_texts.txt',
    )