# Source file: car_wei_zhang.py (32 KB)
  """Vehicle traffic-violation data processing.

  Cleans the monthly violation workbook, bulk-loads the result into
  PostgreSQL, archives the workbook on a remote host over SFTP and finally
  populates the derived reporting tables.
  """
  import re
  import subprocess
  from datetime import datetime

  from dateutil.relativedelta import relativedelta
  from loguru import logger
  import pandas as pd
  import psycopg
  import paramiko

  # Also write log records to the file a.log.
  logger.add(sink='a.log')

  # NOTE(review): the SSH and database credentials below are committed in
  # clear text; consider loading them from the environment or a secret store.
  ssh_hostname = '172.16.107.4'  # remote host that archives the workbook
  ssh_port = 22  # SSH port
  ssh_username = 'app'  # SSH login user
  ssh_password = '(l4w0ST_'  # SSH login password

  # Directory on the remote host that receives the uploaded workbook.
  remote_dir_path = '/data/history/car/wei-zhang/'

  # PostgreSQL connection settings.
  db_host = "172.16.107.5"  # database host
  db_port = 5432  # database port
  db_username = "finance"  # database user
  db_password = "Finance@unicom23"  # database password
  dbname = "financialdb"  # database name

  # Let psycopg assemble the conninfo string so that values containing
  # quotes, backslashes or spaces are escaped correctly.
  conn_info = psycopg.conninfo.make_conninfo(
      host=db_host,
      port=db_port,
      user=db_username,
      password=db_password,
      dbname=dbname,
  )

  # The billing period is the previous calendar month, formatted as YYYYMM.
  today = datetime.today()
  start_date = today - relativedelta(months=1, day=1)
  year_month = start_date.strftime('%Y%m')

  # Input workbook.
  input_path = 'data.xlsx'
  # Cleaned CSV produced by data_process().
  output_path = 'output.csv'
  def data_process():
      """Clean the violation workbook and write the enriched rows to CSV.

      Reads ``input_path`` with pandas, loads organisation and area reference
      rows from PostgreSQL (``common.organization`` and ``common.area``),
      normalises licence plates, derives the first/second/third level unit
      names, organisation codes and city/district ids, and writes the result
      to ``output_path`` using a fixed, positional list of 33 header names.

      The CSV header is positional, so the order in which columns are added
      to the DataFrame below must not change.
      """
      # Matches any character that suggests the value was meant to be a plate
      # (province abbreviations plus a few special markers).
      has_che_pai_province_pattern = re.compile(
          "[" + re.escape("京津晋冀蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川贵云藏陕甘青宁国防") + "]")
      # Matches every character that can never be part of a plate.
      not_che_pai_pattern = re.compile(
          "[^京津晋冀蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川贵云藏陕甘青宁新港澳学挂领试超练警国防A-Z\\d]")
      # Matches a complete plate number.
      che_pai_pattern = re.compile(
          r"([京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z][A-Z]"
          r"(([DF]((?![IO])[A-Z0-9](?![IO]))\d{4})|(\d{5}[DF]))|"
          r"[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z][A-Z][A-Z0-9]{4}[A-Z0-9挂学警港澳])"
      )

      # City -> names of its subordinate units (second-level unit lookup).
      er_ji_map = {
          "石家庄": ["鹿泉", "藁城", "栾城", "井陉矿区", "井陉", "无极", "正定", "元氏", "新乐", "晋州", "平山", "灵寿",
                  "赞皇", "赵县", "行唐", "高邑", "辛集", "深泽"],
          "唐山": ["唐山高开区", "迁西", "海港", "开平", "丰南", "滦县", "乐亭", "丰润", "玉田", "古冶", "曹妃甸", "遵化",
                 "滦南", "迁安"],
          "秦皇岛": ["北戴河新区", "北戴河", "山海关", "昌黎", "卢龙", "青龙", "抚宁"],
          "邯郸": ["曲周", "魏县", "馆陶", "磁县", "大名", "鸡泽", "成安", "涉县", "永年", "武安", "峰峰", "广平", "临漳",
                 "邱县", "肥乡"],
          "邢台": ["新河", "南宫", "隆尧", "内邱", "平乡", "宁晋", "广宗", "清河", "临西", "任县", "巨鹿", "沙河", "威县",
                 "临城", "柏乡", "南和"],
          "保定": ["涞水", "蠡县", "顺平", "博野", "安国", "涞源", "唐县", "定州", "高阳", "曲阳", "阜平", "清苑",
                 "高碑店",
                 "满城", "涿州", "易县", "望都", "徐水", "定兴", "白沟"],
          "张家口": ["张北", "崇礼", "康保", "赤城", "阳原", "万全", "下花园", "尚义", "怀安", "怀来", "蔚县", "涿鹿",
                  "沽源",
                  "宣化"],
          "承德": ["承德县", "兴隆", "宽城", "平泉", "营子", "隆化", "滦平", "围场", "丰宁", "双滦"],
          "廊坊": ["文安", "霸州", "大城", "廊坊开发区", "三河", "香河", "永清", "胜芳", "燕郊", "固安", "大厂"],
          "沧州": ["东光", "吴桥", "黄骅", "盐山", "孟村", "泊头", "献县", "南皮", "渤海新区", "海兴", "沧县", "河间",
                 "青县",
                 "任丘", "肃宁"],
          "衡水": ["景县", "阜城", "枣强", "深州", "饶阳", "故城", "武强", "武邑", "冀州", "安平"],
          "雄安": ["容城", "雄县", "安新"]
      }

      # Ordered (keyword, result) tables; the first keyword found wins, so the
      # longer keyword must stay ahead of any keyword it contains.
      ji_dong_keywords = ("机动通信局", "机动局", "传输局", "线路维护中心")
      ben_bu_keywords = ("省公司本部", "雄安基地建设部", "华北基地建设部")
      second_unit_aliases = {
          "石家庄": (("开发区", "石家庄开发区"),),
          "廊坊": (("开发区", "廊坊开发区"),),
          "邢台": (("内丘", "内邱"), ("任泽", "任县")),
          "唐山": (("高开区", "唐山高开区"), ("滦州", "滦县")),
      }
      city_no_overrides = {
          "石家庄": (("井陉矿区", "D0130185"), ("井陉", "D0130121")),
          "秦皇岛": (("北戴河新区", "D0130325"), ("北戴河", "D0130304")),
          "邯郸": (("峰峰", "D0130406"),),
          "邢台": (("内丘", "D0130523"), ("任泽", "D0130526")),
      }
      ji_dong_city_no = (
          ("沧州", "HECS180"), ("唐山", "HECS181"), ("秦皇岛", "HECS182"), ("廊坊", "HECS183"),
          ("张家口", "HECS184"), ("邢台", "HECS185"), ("邯郸", "HECS186"), ("保定", "HECS187"),
          ("石家庄", "HECS188"), ("承德", "HECS189"), ("衡水", "HECS720"), ("雄安", "HECS728"),
      )
      default_city_no = {
          "沧州": "D0130911", "唐山": "D0130202", "秦皇岛": "D0130302", "廊坊": "D0131000",
          "张家口": "D0130701", "邢台": "D0130502", "邯郸": "D0130402", "保定": "D0130601",
          "石家庄": "D0130186", "承德": "D0130801", "衡水": "D0133001", "雄安": "D0130830",
      }
      # NOTE(review): 雄安 maps to "782" here but to "HECS728" in
      # ji_dong_city_no above - confirm which digit order is intended.
      area_no2_codes = (
          ("沧州", "180"), ("唐山", "181"), ("秦皇岛", "182"), ("廊坊", "183"),
          ("张家口", "184"), ("邢台", "185"), ("邯郸", "186"), ("保定", "187"),
          ("石家庄", "188"), ("承德", "189"), ("衡水", "720"), ("雄安", "782"),
      )
      district_overrides = {
          "石家庄": (("井陉矿区", "130107"), ("井陉", "130121")),
      }

      # ---- reference data from PostgreSQL --------------------------------
      org_map = {}             # organisation id -> row (grade 1 and 2)
      third_org_list_map = {}  # grade-1 organisation id -> its grade-2 rows
      area_map = {}            # area id -> row (grade 1 and 2)
      district_list_map = {}   # city area id -> its district rows

      with psycopg.connect(
              conninfo=conn_info,
              row_factory=psycopg.rows.dict_row
      ) as conn:
          with conn.cursor() as curs:
              def fetch_all(sql):
                  """Log and run ``sql`` and return every row as a dict."""
                  logger.info(f"sql: {sql}")
                  curs.execute(sql)
                  return curs.fetchall()

              second_orgs = fetch_all(
                  "select * from common.organization where grade = 1 order by order_num")
              third_orgs = fetch_all(
                  "select * from common.organization where grade = 2 order by parent_id, order_num")
              cities = fetch_all(
                  "select * from common.area where area_grade = 1 order by area_id")
              districts = fetch_all(
                  "select * from common.area where area_grade = 2 order by parent_id, area_id")

      for org in second_orgs:
          org_map[org['id']] = org
          third_org_list_map[org['id']] = []
      for org in third_orgs:
          org_map[org['id']] = org
          # setdefault: a grade-2 row whose parent is not a grade-1 row must
          # not abort the whole run with a KeyError.
          third_org_list_map.setdefault(org['parent_id'], []).append(org)
      for city in cities:
          area_map[city['area_id']] = city
          district_list_map[city['area_id']] = []
      for district in districts:
          area_map[district['area_id']] = district
          siblings = district_list_map.get(district['parent_id'])
          if siblings is not None:
              siblings.append(district)

      # ---- small shared helpers -------------------------------------------
      def as_text(value):
          """Return ``value`` as text, or "" when it is missing or blank.

          Non-string cells (for example numbers read from Excel) are
          converted with ``str`` instead of failing on ``.strip()``.
          """
          if pd.isna(value) or not value:
              return ""
          text = value if isinstance(value, str) else str(value)
          return text if text.strip() else ""

      def row_text(row, column):
          """Return ``row[column]`` as ``str``, or "" when it is missing."""
          value = row[column]
          return str(value) if pd.notna(value) else ""

      def first_match(text, table):
          """Return the result of the first ``(keyword, result)`` pair whose
          keyword occurs in ``text``, or ``None`` when nothing matches."""
          for keyword, result in table:
              if keyword in text:
                  return result
          return None

      # ---- load and pre-clean the workbook --------------------------------
      df = pd.read_excel(io=input_path)
      # Strip all whitespace from every string cell except the two time columns.
      columns_to_clean = [c for c in df.columns if c not in ('违章时间', '处理时间')]
      df[columns_to_clean] = df[columns_to_clean].map(
          lambda x: re.sub(r'\s+', '', x) if isinstance(x, str) else x)
      df['账期'] = year_month
      # Keep the raw unit names and plate before they are overwritten.
      df['原始一级单位'] = df['一级单位']
      df['原始二级单位'] = df['二级单位']
      df['原始三级单位'] = df['三级单位']
      df['原始车牌号'] = df['车牌号']

      # ---- licence plate ---------------------------------------------------
      def get_che_pai(che_pai):
          """Extract a normalised plate; fall back to the cleaned input."""
          raw = as_text(che_pai)
          if not raw:
              return ""
          upper_case = raw.upper()
          s = not_che_pai_pattern.sub("", upper_case)
          m = che_pai_pattern.search(s)
          if m:
              return m.group(0)
          if has_che_pai_province_pattern.search(raw):
              # Looks like a plate but is malformed: keep the stripped text.
              logger.warning(f"车牌匹配失败: {che_pai} -> {s}")
              return s
          logger.warning(f"车牌匹配失败: {che_pai} -> {upper_case}")
          return upper_case

      df['车牌号'] = df['车牌号'].apply(get_che_pai)

      def che_pai_fail(che_pai):
          """Return "0" when the value contains a valid plate, else "1"."""
          text = as_text(che_pai)
          if not text:
              return "1"
          s = not_che_pai_pattern.sub("", text.upper())
          return "0" if che_pai_pattern.search(s) else "1"

      df['车牌匹配失败'] = df['车牌号'].apply(che_pai_fail)

      # ---- unit names ------------------------------------------------------
      def get_first_unit(unit):
          """Map the raw first-level unit to its canonical name."""
          unit = as_text(unit)
          if not unit:
              return ""
          if any(keyword in unit for keyword in ji_dong_keywords):
              return "机动局"
          if "雄安基地建设部" in unit:
              return "雄安基地建设部"
          if "华北基地建设部" in unit:
              return "华北基地建设部"
          for yj in er_ji_map:
              if yj in unit:
                  return yj
          return "省公司本部"

      df['一级单位'] = df['原始一级单位'].apply(get_first_unit)

      def get_second_unit(x):
          """Derive the second-level unit from the first-level unit and the
          raw second-level unit text of row ``x``."""
          first_unit = row_text(x, '一级单位')
          unit = row_text(x, '原始二级单位')
          if not unit or not unit.strip():
              return first_unit
          if first_unit == "省公司本部":
              return first_unit
          if first_unit == "机动局":
              for yj in er_ji_map:
                  if yj in unit:
                      return f"机动局{yj}"
              return "机动局本部"
          # City-specific spellings that differ from the names in er_ji_map.
          alias = first_match(unit, second_unit_aliases.get(first_unit, ()))
          if alias is not None:
              return alias
          ejs = er_ji_map.get(first_unit)
          if not ejs:
              return first_unit
          if first_unit == "雄安":
              unit = unit.replace("雄安新区", "")
          for ej in ejs:
              if ej in unit:
                  return ej
          return f"{first_unit}本部"

      df['二级单位'] = df.apply(get_second_unit, axis=1)

      def get_third_unit(x):
          """Derive the third-level unit: the 4th "_"-separated part of the
          raw value, the raw value itself when it has no "_", otherwise the
          second-level unit."""
          second_unit = row_text(x, '二级单位')
          unit = row_text(x, '原始三级单位')
          if not unit or not unit.strip():
              return second_unit
          parts = unit.split("_")
          if len(parts) == 1:
              return unit
          if len(parts) < 4:
              return second_unit
          return parts[3]

      df['三级单位'] = df.apply(get_third_unit, axis=1)

      # ---- organisation codes ---------------------------------------------
      def get_area_no(unit):
          """Return the second-level organisation code for a raw unit name."""
          unit = as_text(unit)
          if not unit:
              return ""
          if any(keyword in unit for keyword in ji_dong_keywords):
              return "-11"
          if any(keyword in unit for keyword in ben_bu_keywords):
              return "-12"
          for second_org in second_orgs:
              name = second_org.get('name')
              # Skip rows without a name instead of failing on `None in str`.
              if name and name in unit:
                  return second_org.get('id')
          return "-12"

      df['二级组织机构编码'] = df['原始一级单位'].apply(get_area_no)

      def get_org_name(org_no):
          """Return the name stored in org_map for ``org_no`` ("" if unknown)."""
          if not as_text(org_no):
              return ""
          po = org_map.get(org_no)
          if po is not None:
              return po.get('name')
          return ""

      df['二级组织机构名称'] = df['二级组织机构编码'].apply(get_org_name)

      def get_city_no(x):
          """Return the third-level organisation code for row ``x``."""
          area_no = row_text(x, '二级组织机构编码')
          area_name = row_text(x, '二级组织机构名称')
          unit = row_text(x, '原始二级单位')
          if not area_no or not area_name:
              return ""
          override = first_match(unit, city_no_overrides.get(area_name, ()))
          if override is not None:
              return override
          if area_name == "省机动局":
              code = first_match(unit, ji_dong_city_no)
              return code if code is not None else "HECS018"
          if area_name == "雄安":
              unit = unit.replace("雄安新区", "")
          for organization_po in third_org_list_map.get(area_no, []):
              name = organization_po.get('name')
              if name and name in unit:
                  return organization_po.get('id')
          return default_city_no.get(area_name, "HE001")

      df['三级组织机构编码'] = df.apply(get_city_no, axis=1)
      df['三级组织机构名称'] = df['三级组织机构编码'].apply(get_org_name)

      def get_area_no2(x):
          """Return the alternative second-level code for row ``x``."""
          area_name = row_text(x, '二级组织机构名称')
          city_name = row_text(x, '三级组织机构名称')
          if not area_name or not area_name.strip():
              return ""
          if area_name == "省机动局" and city_name and city_name.strip():
              code = first_match(city_name, area_no2_codes)
              if code is not None:
                  return code
          code = first_match(area_name, area_no2_codes)
          return code if code is not None else ""

      df['二级组织机构编码2'] = df.apply(get_area_no2, axis=1)
      df['二级组织机构名称2'] = df['二级组织机构编码2'].apply(get_org_name)

      # ---- administrative areas -------------------------------------------
      def get_city_id(unit):
          """Return the id of the first city whose short name occurs in ``unit``."""
          unit = as_text(unit)
          if not unit:
              return ""
          for city in cities:
              if city.get('short_name') and city['short_name'] in unit:
                  return city.get('area_id', "")
          return ""

      df['city_id'] = df['原始一级单位'].apply(get_city_id)

      def get_area_name(area_id):
          """Return the area name stored in area_map for ``area_id``."""
          if not as_text(area_id):
              return ""
          area_po = area_map.get(area_id)
          if area_po is not None:
              return area_po.get("area_name", "")
          return ""

      df['city'] = df['city_id'].apply(get_area_name)

      def get_district_id(x):
          """Return the district id for row ``x`` ("" when none matches)."""
          city_id = row_text(x, 'city_id')
          city = row_text(x, 'city')
          unit = row_text(x, '原始二级单位')
          if not city_id or not city or not unit:
              return ""
          override = first_match(unit, district_overrides.get(city, ()))
          if override is not None:
              return override
          if city == "雄安":
              unit = unit.replace("雄安新区", "")
          candidates = district_list_map.get(city_id)
          if not candidates:
              return ""
          for district in candidates:
              short_name = district.get('short_name')
              if short_name and short_name in unit:
                  return district.get('area_id')
          return ""

      df['district_id'] = df.apply(get_district_id, axis=1)
      df['district'] = df['district_id'].apply(get_area_name)

      # ---- period columns --------------------------------------------------
      df['year_no'] = df['账期'].apply(lambda x: None if pd.isna(x) else str(x)[:4])
      df['month_no'] = df['账期'].apply(lambda x: None if pd.isna(x) else str(x)[-2:])
      df['违章年月'] = df['违章时间'].apply(
          lambda x: None if pd.isna(x) else pd.to_datetime(x).strftime('%Y%m'))
      df['处理年月'] = df['处理时间'].apply(
          lambda x: None if pd.isna(x) else pd.to_datetime(x).strftime('%Y%m'))

      # DataFrame.info() prints the summary itself and returns None, so it is
      # not wrapped in print().
      df.info()

      # Header names are applied positionally to the DataFrame columns.
      df.to_csv(path_or_buf=output_path,
                header=['year_month', 'che_pai_hao', 'first_unit', 'second_unit', 'third_unit', 'che_jia_hao',
                        'wei_zhang_shi_jian', 'wei_zhang_di_dian', 'wei_zhang_xiang_qing', 'kou_fen', 'fa_kuan',
                        'chu_li_zhuang_tai', 'chu_li_shi_jian', 'wei_zhang_wei_chu_li_shi_chang', 'raw_yi_ji',
                        'raw_er_ji',
                        'raw_san_ji', 'raw_che_pai_hao', 'che_pai_fail', 'area_no', 'area_name', 'city_no', 'city_name',
                        'area_no2', 'area_name2', 'city_id', 'city', 'district_id', 'district', 'year_no', 'month_no',
                        'wei_zhang_nian_yue', 'chu_li_nian_yue'],
                index=False,
                encoding='utf-8-sig')
  def data_import():
      """Bulk-load ``output_path`` into ``car.car_wei_zhang`` via copy.ps1.

      Runs the PowerShell script with the database settings, the target
      table, the CSV file and the column list, then looks for a ``COPY <n>``
      line in the script's standard output.

      Raises:
          RuntimeError: if no ``COPY <n>`` line is found in the output.
      """
      # PowerShell script that performs the actual import.
      script_path = r"../../copy.ps1"
      # Target table.
      table = "car.car_wei_zhang"
      # Column list, in the same order as the CSV header.
      columns = "year_month,che_pai_hao,first_unit,second_unit,third_unit,che_jia_hao,wei_zhang_shi_jian,wei_zhang_di_dian,wei_zhang_xiang_qing,kou_fen,fa_kuan,chu_li_zhuang_tai,chu_li_shi_jian,wei_zhang_wei_chu_li_shi_chang,raw_yi_ji,raw_er_ji,raw_san_ji,raw_che_pai_hao,che_pai_fail,area_no,area_name,city_no,city_name,area_no2,area_name2,city_id,city,district_id,district,year_no,month_no,wei_zhang_nian_yue,chu_li_nian_yue"

      script_args = [
          ("-db_host", db_host),
          ("-db_port", db_port),
          ("-db_username", db_username),
          ("-db_password", db_password),
          ("-dbname", dbname),
          ("-table", table),
          ("-filename", output_path),
          ("-columns", columns),
      ]
      # An argument list lets subprocess quote each value, so values with
      # spaces or shell metacharacters are passed through intact.
      command = ["powershell", "-File", script_path]
      loggable = list(command)
      for flag, value in script_args:
          command += [flag, str(value)]
          # Never write the database password to the log.
          loggable += [flag, "******" if flag == "-db_password" else str(value)]
      logger.info("command: {}", " ".join(loggable))

      completed_process = subprocess.run(
          command,
          check=False,  # failure is detected from the output below
          text=True,
          capture_output=True,
      )
      logger.info("导入结果:\n{}\n{}\n{}", completed_process.returncode, completed_process.stdout,
                  completed_process.stderr)

      # The COPY summary may be surrounded by other output lines, so search
      # line by line instead of matching the whole of stdout.
      matcher = re.search(r"^(COPY) (\d+)\s*$", completed_process.stdout or "", re.MULTILINE)
      count = int(matcher.group(2)) if matcher else None
      if count is None:
          raise RuntimeError("导入数据失败")
      logger.info("imported rows: {}", count)
  def upload_file():
      """Upload the input workbook to the remote host as ``<year_month>.xlsx``.

      Connects over SSH with the module-level credentials, makes sure
      ``remote_dir_path`` exists and copies ``input_path`` into it via SFTP.
      """
      remote_path = f'{remote_dir_path}{year_month}.xlsx'
      with paramiko.SSHClient() as ssh:
          # Accept unknown host keys so the first connection does not fail.
          ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
          ssh.connect(ssh_hostname, port=ssh_port, username=ssh_username, password=ssh_password)

          # exec_command returns immediately; wait for mkdir to finish so the
          # upload cannot start before the directory exists.
          _stdin, stdout, stderr = ssh.exec_command(f'mkdir -p {remote_dir_path}')
          exit_status = stdout.channel.recv_exit_status()
          if exit_status != 0:
              # Best effort: the directory may already exist, so only warn and
              # let sftp.put report a genuinely missing directory.
              logger.warning("mkdir -p {} exited with {}: {}", remote_dir_path, exit_status,
                             stderr.read().decode(errors='replace'))

          with ssh.open_sftp() as sftp:
              logger.info("upload {} to {}", input_path, remote_path)
              sftp.put(input_path, remote_path)
              logger.info("uploaded {}", input_path)
  def data_update():
      """Copy this period's rows from ``car.car_wei_zhang`` into the derived tables.

      Two inserts run on one connection:

      1. rows still unprocessed ('未处理') for more than 150 (per the
         ``wei_zhang_wei_chu_li_shi_chang`` column) go to
         ``car.car_wei_zhang_chang_qi``;
      2. every row of the period goes to ``car_theme.wz_f_violation_details``.

      ``year_month`` is sent as a bound parameter rather than being pasted
      into the SQL text, so the comparison works whether the column is text
      or numeric.
      """
      # Long-unprocessed violations.
      chang_qi_sql = """
          insert into car.car_wei_zhang_chang_qi
          (
              year_month,
              che_pai_hao,
              raw_yi_ji,
              raw_er_ji,
              raw_san_ji,
              wei_zhang_shi_jian,
              wei_zhang_di_dian,
              wei_zhang_xiang_qing,
              kou_fen,
              fa_kuan,
              wei_zhang_wei_chu_li_shi_chang,
              chu_li_zhuang_tai,
              first_unit,
              second_unit,
              third_unit,
              area_no,
              area_name,
              city_no,
              city_name,
              area_name2,
              area_no2,
              city_id,
              city,
              district_id,
              district,
              raw_che_pai_hao,
              che_pai_fail,
              wei_zhang_nian_yue,
              year_no,
              month_no,
              source
          )
          select
              year_month,
              che_pai_hao,
              raw_yi_ji,
              raw_er_ji,
              raw_san_ji,
              wei_zhang_shi_jian,
              wei_zhang_di_dian,
              wei_zhang_xiang_qing,
              kou_fen,
              fa_kuan,
              wei_zhang_wei_chu_li_shi_chang,
              chu_li_zhuang_tai,
              first_unit,
              second_unit,
              third_unit,
              area_no,
              area_name,
              city_no,
              city_name,
              area_name2,
              area_no2,
              city_id,
              city,
              district_id,
              district,
              raw_che_pai_hao,
              che_pai_fail,
              wei_zhang_nian_yue,
              year_no,
              month_no,
              source
          from
              car.car_wei_zhang
          where
              chu_li_zhuang_tai = '未处理'
              and wei_zhang_wei_chu_li_shi_chang > 150
              and year_month = %s
      """
      # All violations of the period.
      details_sql = """
          insert into car_theme.wz_f_violation_details
          (
              statistical_month,
              card_num,
              city,
              dpt_sec,
              grid,
              violation_time,
              violation_location,
              violation_details,
              deduction_points,
              fine,
              processing_time,
              unprocessed_duration_of_violation,
              offline_actual_processing_status
          )
          select
              year_month,
              che_pai_hao,
              first_unit,
              second_unit,
              third_unit,
              wei_zhang_shi_jian,
              wei_zhang_di_dian,
              wei_zhang_xiang_qing,
              kou_fen,
              fa_kuan,
              chu_li_shi_jian,
              wei_zhang_wei_chu_li_shi_chang,
              chu_li_zhuang_tai
          from
              car.car_wei_zhang
          where
              year_month = %s
      """
      params = (year_month,)
      with psycopg.connect(
              conninfo=conn_info,
      ) as conn:
          with conn.cursor() as curs:
              for sql in (chang_qi_sql, details_sql):
                  logger.info("sql: {} params: {}", sql, params)
                  curs.execute(sql, params)
                  logger.info(f"update {curs.rowcount}")
  # Run the pipeline in order: clean the workbook, bulk-load the CSV,
  # archive the workbook on the remote host, then fill the derived tables.
  for _step in (data_process, data_import, upload_file, data_update):
      _step()