car_chao-bao.py 30 KB


  1. """车辆超保数据处理
  2. """
import os
import re
import subprocess
from datetime import datetime

import pandas as pd
import paramiko
import psycopg
from dateutil.relativedelta import relativedelta
from loguru import logger
  11. # 添加日志记录,将日志输出到文件 a.log
  12. logger.add(sink='a.log')
  13. ssh_hostname = '172.16.107.4' # 定义远程主机地址
  14. ssh_port = 22 # 定义SSH服务的端口号
  15. ssh_username = 'app' # 定义登录远程主机的用户名
  16. ssh_password = '(l4w0ST_' # 定义登录远程主机的密码
  17. # 服务器文件夹路径
  18. remote_dir_path = '/data/history/car/chao-bao/'
  19. # 数据库连接信息
  20. db_host = "172.16.107.5" # 数据库主机地址
  21. db_port = 5432 # 数据库端口号
  22. db_username = "finance" # 数据库用户名
  23. db_password = "Finance@unicom23" # 数据库密码
  24. dbname = "financialdb" # 数据库名称
  25. conn_info = f"host='{db_host}' port={db_port} user='{db_username}' password='{db_password}' dbname='{dbname}'"
  26. # 获取当前日期,并计算上个月的第一天
  27. today = datetime.today()
  28. start_date = today - relativedelta(months=1, day=1)
  29. year_month = start_date.strftime('%Y%m')
  30. # 数据文件路径
  31. input_path = 'data.xlsx'
  32. # 输出文件路径
  33. output_path = 'output.csv'
  34. def data_process():
  35. # 正则表达式匹配车牌省份简称(如京、津、晋等)
  36. has_che_pai_province_pattern = re.compile(
  37. "[" + re.escape("京津晋冀蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川贵云藏陕甘青宁国防") + "]")
  38. # 正则表达式匹配非车牌字符,排除车牌可能包含的字符(如字母、数字、特殊标志等)
  39. not_che_pai_pattern = re.compile(
  40. "[^京津晋冀蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川贵云藏陕甘青宁新港澳学挂领试超练警国防A-Z\\d]")
  41. # 正则表达式匹配完整的车牌号格式
  42. che_pai_pattern = re.compile(
  43. r"([京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z][A-Z]"
  44. r"(([DF]((?![IO])[A-Z0-9](?![IO]))\d{4})|(\d{5}[DF]))|"
  45. r"[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z][A-Z][A-Z0-9]{4}[A-Z0-9挂学警港澳])"
  46. )
  47. # 定义二级行政区划映射表(地级市及其下属区县)
  48. er_ji_map = {
  49. "石家庄": ["鹿泉", "藁城", "栾城", "井陉矿区", "井陉", "无极", "正定", "元氏", "新乐", "晋州", "平山", "灵寿",
  50. "赞皇", "赵县", "行唐", "高邑", "辛集", "深泽"],
  51. "唐山": ["唐山高开区", "迁西", "海港", "开平", "丰南", "滦县", "乐亭", "丰润", "玉田", "古冶", "曹妃甸", "遵化",
  52. "滦南", "迁安"],
  53. "秦皇岛": ["北戴河新区", "北戴河", "山海关", "昌黎", "卢龙", "青龙", "抚宁"],
  54. "邯郸": ["曲周", "魏县", "馆陶", "磁县", "大名", "鸡泽", "成安", "涉县", "永年", "武安", "峰峰", "广平", "临漳",
  55. "邱县", "肥乡"],
  56. "邢台": ["新河", "南宫", "隆尧", "内邱", "平乡", "宁晋", "广宗", "清河", "临西", "任县", "巨鹿", "沙河", "威县",
  57. "临城", "柏乡", "南和"],
  58. "保定": ["涞水", "蠡县", "顺平", "博野", "安国", "涞源", "唐县", "定州", "高阳", "曲阳", "阜平", "清苑",
  59. "高碑店",
  60. "满城", "涿州", "易县", "望都", "徐水", "定兴", "白沟"],
  61. "张家口": ["张北", "崇礼", "康保", "赤城", "阳原", "万全", "下花园", "尚义", "怀安", "怀来", "蔚县", "涿鹿",
  62. "沽源",
  63. "宣化"],
  64. "承德": ["承德县", "兴隆", "宽城", "平泉", "营子", "隆化", "滦平", "围场", "丰宁", "双滦"],
  65. "廊坊": ["文安", "霸州", "大城", "廊坊开发区", "三河", "香河", "永清", "胜芳", "燕郊", "固安", "大厂"],
  66. "沧州": ["东光", "吴桥", "黄骅", "盐山", "孟村", "泊头", "献县", "南皮", "渤海新区", "海兴", "沧县", "河间",
  67. "青县",
  68. "任丘", "肃宁"],
  69. "衡水": ["景县", "阜城", "枣强", "深州", "饶阳", "故城", "武强", "武邑", "冀州", "安平"],
  70. "雄安": ["容城", "雄县", "安新"]
  71. }
  72. # 初始化组织结构映射表
  73. org_map = {}
  74. third_org_map = {}
  75. third_org_list_map = {}
  76. area_map = {}
  77. district_list_map = {}
  78. # 连接PostgreSQL数据库
  79. with psycopg.connect(
  80. conninfo=conn_info,
  81. row_factory=psycopg.rows.dict_row
  82. ) as conn:
  83. with conn.cursor() as curs:
  84. # 查询一级组织数据,并按order_num排序
  85. sql = """
  86. select * from common.organization where grade = 1 order by order_num
  87. """
  88. logger.info(f"sql: {sql}")
  89. curs.execute(sql)
  90. second_orgs = curs.fetchall()
  91. # 遍历一级组织数据,构建org_map和third_org_list_map
  92. for x in second_orgs:
  93. org_map[x['id']] = x
  94. third_org_list_map[x['id']] = []
  95. # 查询二级组织数据,并按parent_id和order_num排序
  96. sql = """
  97. select * from common.organization where grade = 2 order by parent_id, order_num
  98. """
  99. logger.info(f"sql: {sql}")
  100. curs.execute(sql)
  101. third_orgs = curs.fetchall()
  102. # 遍历二级组织数据,构建org_map、third_org_list_map和third_org_map
  103. for x in third_orgs:
  104. org_map[x['id']] = x
  105. third_org_list_map[x['parent_id']].append(x)
  106. third_org_map[x['id']] = x
  107. # 查询一级行政区划数据,并按area_id排序
  108. sql = """
  109. select * from common.area where area_grade = 1 order by area_id
  110. """
  111. logger.info(f"sql: {sql}")
  112. curs.execute(sql)
  113. cities = curs.fetchall()
  114. # 遍历一级行政区划数据,构建area_map
  115. for city in cities:
  116. area_map[city['area_id']] = city
  117. # 查询二级行政区划数据,并按parent_id和area_id排序
  118. sql = """
  119. select * from common.area where area_grade = 2 order by parent_id, area_id
  120. """
  121. logger.info(f"sql: {sql}")
  122. curs.execute(sql)
  123. districts = curs.fetchall()
  124. # 遍历二级行政区划数据,构建area_map和district_list_map
  125. for district in districts:
  126. area_map[district['area_id']] = district
  127. # 构建城市与区县的映射关系
  128. for city in cities:
  129. district_list_map[city['area_id']] = []
  130. for district in districts:
  131. if city['area_id'] == district['parent_id']:
  132. district_list_map[city['area_id']].append(district)
  133. # 读取 Excel 文件中的数据
  134. df = pd.read_excel(io=input_path)
  135. # 获取需要清理的列名列表,排除 "登记日期" 和 "进厂时间" 列
  136. columns_to_clean = list(filter(lambda x: x not in ('登记日期', '进厂时间'), df.columns))
  137. # 对需要清理的列进行字符串清理,移除多余的空白字符
  138. df[columns_to_clean] = df[columns_to_clean].map(lambda x: re.sub(r'\s+', '', x) if type(x) is str else x)
  139. df['账期'] = year_month
  140. # 保存原始单位和车牌号信息到新的列中
  141. df['原始一级单位'] = df['一级单位']
  142. df['原始二级单位'] = df['二级单位']
  143. df['原始三级单位'] = df['三级单位']
  144. df['原始车牌号'] = df['车牌号']
  145. # 定义函数,用于提取并标准化车牌号
  146. def get_che_pai(che_pai):
  147. # 如果车牌号为空或无效,则返回空字符串
  148. if pd.isna(che_pai) or not che_pai or not che_pai.strip():
  149. return ""
  150. # 将车牌号转换为大写
  151. upper_case = che_pai.upper()
  152. # 移除车牌号中不符合规则的字符
  153. s = not_che_pai_pattern.sub("", upper_case)
  154. # 使用正则表达式匹配合法的车牌号
  155. m = che_pai_pattern.search(s)
  156. if m:
  157. return m.group(0)
  158. # 如果车牌号包含省份简称但未匹配成功,记录警告日志
  159. if has_che_pai_province_pattern.search(che_pai):
  160. logger.warning(f"车牌匹配失败: {che_pai} -> {s}")
  161. return s
  162. # 如果完全无法匹配,记录警告日志并返回原车牌号
  163. logger.warning(f"车牌匹配失败: {che_pai} -> {upper_case}")
  164. return upper_case
  165. # 应用 get_che_pai 函数处理车牌号列
  166. df['车牌号'] = df['车牌号'].apply(get_che_pai)
  167. # 去重
  168. df.drop_duplicates(subset=['车牌号'], keep='last', inplace=True)
  169. # 定义函数,用于标记车牌号是否匹配失败
  170. def che_pai_fail(che_pai):
  171. # 如果车牌号为空或无效,则标记为失败
  172. if pd.isna(che_pai) or not che_pai or not che_pai.strip():
  173. return "1"
  174. # 移除车牌号中不符合规则的字符
  175. s = not_che_pai_pattern.sub("", che_pai.upper())
  176. # 使用正则表达式匹配合法的车牌号
  177. m = che_pai_pattern.search(s)
  178. if m:
  179. return "0" # 匹配成功
  180. return "1" # 匹配失败
  181. # 应用 che_pai_fail 函数生成车牌匹配失败标记列
  182. df['车牌匹配失败'] = df['车牌号'].apply(che_pai_fail)
  183. # 定义函数,用于提取一级单位
  184. def get_first_unit(unit):
  185. # 如果单位为空或无效,则返回空字符串
  186. if pd.isna(unit) or not unit or not unit.strip():
  187. return ""
  188. # 根据单位名称中的关键词返回对应的一级单位
  189. if "机动通信局" in unit or "机动局" in unit or "传输局" in unit or "线路维护中心" in unit:
  190. return "机动局"
  191. if "雄安基地建设部" in unit:
  192. return "雄安基地建设部"
  193. if "华北基地建设部" in unit:
  194. return "华北基地建设部"
  195. # 遍历 er_ji_map 的键,寻找匹配的一级单位
  196. for yj in er_ji_map.keys():
  197. if yj in unit:
  198. return yj
  199. return "省公司本部" # 默认返回省公司本部
  200. # 应用 get_first_unit 函数生成一级单位列
  201. df['一级单位'] = df['原始一级单位'].apply(get_first_unit)
  202. # 定义函数,用于提取二级单位
  203. def get_second_unit(x):
  204. # 获取一级单位和原始二级单位
  205. first_unit = str(x['一级单位']) if pd.notna(x['一级单位']) else ""
  206. unit = str(x['原始二级单位']) if pd.notna(x['原始二级单位']) else ""
  207. # 如果二级单位为空或无效,则返回一级单位
  208. if not unit or not unit.strip():
  209. return first_unit
  210. # 如果一级单位是省公司本部,则返回省公司本部
  211. if first_unit == "省公司本部":
  212. return first_unit
  213. # 如果一级单位是机动局,则根据单位名称进一步细化
  214. if first_unit == "机动局":
  215. for yj in er_ji_map.keys():
  216. if yj in unit:
  217. return f"机动局{yj}"
  218. return "机动局本部"
  219. # 根据特定城市和关键词返回对应的二级单位
  220. if first_unit == "石家庄":
  221. if "开发区" in unit:
  222. return "石家庄开发区"
  223. if first_unit == "廊坊":
  224. if "开发区" in unit:
  225. return "廊坊开发区"
  226. if first_unit == "邢台":
  227. if "内丘" in unit:
  228. return "内邱"
  229. if "任泽" in unit:
  230. return "任县"
  231. if first_unit == "唐山":
  232. if "高开区" in unit:
  233. return "唐山高开区"
  234. if "滦州" in unit:
  235. return "滦县"
  236. # 根据 er_ji_map 获取二级单位
  237. ejs = er_ji_map.get(first_unit)
  238. if not ejs:
  239. return first_unit
  240. if first_unit == "雄安":
  241. unit = unit.replace("雄安新区", "")
  242. for ej in ejs:
  243. if ej in unit:
  244. return ej
  245. return f"{first_unit}本部" # 默认返回一级单位本部
  246. # 应用 get_second_unit 函数生成二级单位列
  247. df['二级单位'] = df.apply(get_second_unit, axis=1)
  248. # 定义函数,用于提取三级单位
  249. def get_third_unit(x):
  250. # 获取二级单位和原始三级单位
  251. second_unit = str(x['二级单位']) if pd.notna(x['二级单位']) else ""
  252. unit = str(x['原始三级单位']) if pd.notna(x['原始三级单位']) else ""
  253. # 如果三级单位为空或无效,则返回二级单位
  254. if not unit or not unit.strip():
  255. return second_unit
  256. # 按下划线分割三级单位名称
  257. a = unit.split("_")
  258. if len(a) == 1:
  259. return unit
  260. if len(a) < 4:
  261. return second_unit
  262. return a[3] # 返回分割后的第四个部分作为三级单位
  263. # 应用 get_third_unit 函数生成三级单位列
  264. df['三级单位'] = df.apply(get_third_unit, axis=1)
  265. # 定义一个函数,用于根据单位名称获取二级组织机构编码
  266. def get_area_no(unit):
  267. # 如果单位为空或无效,则返回空字符串
  268. if pd.isna(unit) or not unit or not unit.strip():
  269. return ""
  270. # 如果单位包含特定关键词(如“机动通信局”等),返回固定编码"-11"
  271. if any(keyword in unit for keyword in ["机动通信局", "机动局", "传输局", "线路维护中心"]):
  272. return "-11"
  273. # 如果单位包含特定关键词(如“省公司本部”等),返回固定编码"-12"
  274. if any(keyword in unit for keyword in ["省公司本部", "雄安基地建设部", "华北基地建设部"]):
  275. return "-12"
  276. # 遍历second_orgs列表,匹配单位名称并返回对应的id
  277. for second_org in second_orgs:
  278. if second_org.get('name') in unit:
  279. return second_org.get('id')
  280. # 如果未匹配到任何规则,返回默认编码"-12"
  281. return "-12"
  282. # 将get_area_no函数应用到DataFrame的'原始一级单位'列,生成'二级组织机构编码'列
  283. df['二级组织机构编码'] = df['原始一级单位'].apply(get_area_no)
  284. # 定义一个函数,用于根据组织机构编码获取组织机构名称
  285. def get_org_name(org_no):
  286. # 如果编码为空或无效,则返回空字符串
  287. if pd.isna(org_no) or not org_no or not org_no.strip():
  288. return ""
  289. # 在org_map中查找对应编码的组织机构信息,并返回其名称
  290. po = org_map.get(org_no)
  291. if po is not None:
  292. return po.get('name')
  293. return ""
  294. # 将get_org_name函数应用到'二级组织机构编码'列,生成'二级组织机构名称'列
  295. df['二级组织机构名称'] = df['二级组织机构编码'].apply(get_org_name)
  296. # 定义一个函数,用于根据行数据获取三级组织机构编码
  297. def get_city_no(x):
  298. # 获取相关字段值,如果为空则设置为""
  299. area_no = str(x['二级组织机构编码']) if pd.notna(x['二级组织机构编码']) else ""
  300. area_name = str(x['二级组织机构名称']) if pd.notna(x['二级组织机构名称']) else ""
  301. unit = str(x['原始二级单位']) if pd.notna(x['原始二级单位']) else ""
  302. # 如果二级组织机构编码或名称为空,则返回""
  303. if not area_no or not area_name:
  304. return ""
  305. # 根据不同的二级组织机构名称和单位内容,返回对应的三级组织机构编码
  306. if area_name == "石家庄":
  307. if "井陉矿区" in unit:
  308. return "D0130185"
  309. if "井陉" in unit:
  310. return "D0130121"
  311. if area_name == "秦皇岛":
  312. if "北戴河新区" in unit:
  313. return "D0130325"
  314. if "北戴河" in unit:
  315. return "D0130304"
  316. if area_name == "邯郸":
  317. if "峰峰" in unit:
  318. return "D0130406"
  319. if area_name == "邢台":
  320. if "内丘" in unit:
  321. return "D0130523"
  322. if "任泽" in unit:
  323. return "D0130526"
  324. if area_name == "省机动局":
  325. if "沧州" in unit:
  326. return "HECS180"
  327. if "唐山" in unit:
  328. return "HECS181"
  329. if "秦皇岛" in unit:
  330. return "HECS182"
  331. if "廊坊" in unit:
  332. return "HECS183"
  333. if "张家口" in unit:
  334. return "HECS184"
  335. if "邢台" in unit:
  336. return "HECS185"
  337. if "邯郸" in unit:
  338. return "HECS186"
  339. if "保定" in unit:
  340. return "HECS187"
  341. if "石家庄" in unit:
  342. return "HECS188"
  343. if "承德" in unit:
  344. return "HECS189"
  345. if "衡水" in unit:
  346. return "HECS720"
  347. if "雄安" in unit:
  348. return "HECS728"
  349. return "HECS018"
  350. if area_name == "雄安":
  351. unit = unit.replace("雄安新区", "")
  352. l3 = third_org_list_map.get(area_no, [])
  353. for organization_po in l3:
  354. if organization_po.get('name') in unit:
  355. return organization_po.get('id')
  356. if area_name == "沧州":
  357. return "D0130911"
  358. if area_name == "唐山":
  359. return "D0130202"
  360. if area_name == "秦皇岛":
  361. return "D0130302"
  362. if area_name == "廊坊":
  363. return "D0131000"
  364. if area_name == "张家口":
  365. return "D0130701"
  366. if area_name == "邢台":
  367. return "D0130502"
  368. if area_name == "邯郸":
  369. return "D0130402"
  370. if area_name == "保定":
  371. return "D0130601"
  372. if area_name == "石家庄":
  373. return "D0130186"
  374. if area_name == "承德":
  375. return "D0130801"
  376. if area_name == "衡水":
  377. return "D0133001"
  378. if area_name == "雄安":
  379. return "D0130830"
  380. return "HE001"
  381. # 将get_city_no函数应用到DataFrame的每一行,生成'三级组织机构编码'列
  382. df['三级组织机构编码'] = df.apply(get_city_no, axis=1)
  383. # 将get_org_name函数应用到'三级组织机构编码'列,生成'三级组织机构名称'列
  384. df['三级组织机构名称'] = df['三级组织机构编码'].apply(get_org_name)
  385. # 定义一个函数,用于根据行数据获取二级组织机构编码2
  386. def get_area_no2(x):
  387. # 获取相关字段值,如果为空则设置为""
  388. area_name = str(x['二级组织机构名称']) if pd.notna(x['二级组织机构名称']) else ""
  389. city_name = str(x['三级组织机构名称']) if pd.notna(x['三级组织机构名称']) else ""
  390. # 如果二级组织机构名称为空,则返回""
  391. if not area_name or not area_name.strip():
  392. return ""
  393. # 根据二级组织机构名称和三级组织机构名称的内容,返回对应的编码
  394. if area_name == "省机动局" and city_name and city_name.strip():
  395. if "沧州" in city_name:
  396. return "180"
  397. if "唐山" in city_name:
  398. return "181"
  399. if "秦皇岛" in city_name:
  400. return "182"
  401. if "廊坊" in city_name:
  402. return "183"
  403. if "张家口" in city_name:
  404. return "184"
  405. if "邢台" in city_name:
  406. return "185"
  407. if "邯郸" in city_name:
  408. return "186"
  409. if "保定" in city_name:
  410. return "187"
  411. if "石家庄" in city_name:
  412. return "188"
  413. if "承德" in city_name:
  414. return "189"
  415. if "衡水" in city_name:
  416. return "720"
  417. if "雄安" in city_name:
  418. return "782"
  419. if "沧州" in area_name:
  420. return "180"
  421. if "唐山" in area_name:
  422. return "181"
  423. if "秦皇岛" in area_name:
  424. return "182"
  425. if "廊坊" in area_name:
  426. return "183"
  427. if "张家口" in area_name:
  428. return "184"
  429. if "邢台" in area_name:
  430. return "185"
  431. if "邯郸" in area_name:
  432. return "186"
  433. if "保定" in area_name:
  434. return "187"
  435. if "石家庄" in area_name:
  436. return "188"
  437. if "承德" in area_name:
  438. return "189"
  439. if "衡水" in area_name:
  440. return "720"
  441. if "雄安" in area_name:
  442. return "782"
  443. return ""
  444. # 将get_area_no2函数应用到DataFrame的每一行,生成'二级组织机构编码2'列
  445. df['二级组织机构编码2'] = df.apply(get_area_no2, axis=1)
  446. # 将get_org_name函数应用到'二级组织机构编码2'列,生成'二级组织机构名称2'列
  447. df['二级组织机构名称2'] = df['二级组织机构编码2'].apply(get_org_name)
  448. # 定义一个函数,用于根据单位名称获取城市ID
  449. def get_city_id(unit):
  450. # 如果单位为空或无效,则返回""
  451. if pd.isna(unit) or not unit or not unit.strip():
  452. return ""
  453. # 遍历cities列表,匹配单位名称并返回对应的城市ID
  454. for city in cities:
  455. if city.get('short_name') and city['short_name'] in unit:
  456. return city.get('area_id', "")
  457. return ""
  458. # 将get_city_id函数应用到'原始一级单位'列,生成'city_id'列
  459. df['city_id'] = df['原始一级单位'].apply(get_city_id)
  460. # 定义一个函数,用于根据ID获取区域名称
  461. def get_area_name(id):
  462. # 如果ID为空或无效,则返回""
  463. if pd.isna(id) or not id or not id.strip():
  464. return ""
  465. # 在area_map中查找对应ID的区域信息,并返回其名称
  466. area_po = area_map.get(id)
  467. if area_po is not None:
  468. return area_po.get("area_name", "")
  469. return ""
  470. # 将get_area_name函数应用到'city_id'列,生成'city'列
  471. df['city'] = df['city_id'].apply(get_area_name)
  472. # 定义一个函数,用于根据行数据获取区县ID
  473. def get_district_id(x):
  474. # 获取相关字段值,如果为空则设置为""
  475. city_id = str(x['city_id']) if pd.notna(x['city_id']) else ""
  476. city = str(x['city']) if pd.notna(x['city']) else ""
  477. unit = str(x['原始二级单位']) if pd.notna(x['原始二级单位']) else ""
  478. # 如果城市ID、城市名称或单位为空,则返回""
  479. if not city_id or not city or not unit:
  480. return ""
  481. # 根据城市名称和单位内容,返回对应的区县ID
  482. if city == "石家庄":
  483. if "井陉矿区" in unit:
  484. return "130107"
  485. if "井陉" in unit:
  486. return "130121"
  487. if city == "雄安":
  488. unit = unit.replace("雄安新区", "")
  489. districts = district_list_map.get(city_id)
  490. if not districts:
  491. return ""
  492. for district in districts:
  493. if district.get('short_name') in unit:
  494. return district.get('area_id')
  495. return ""
  496. # 将get_district_id函数应用到DataFrame的每一行,生成'district_id'列
  497. df['district_id'] = df.apply(get_district_id, axis=1)
  498. # 将get_area_name函数应用到'district_id'列,生成'district'列
  499. df['district'] = df['district_id'].apply(get_area_name)
  500. # 提取账期年份和月份信息
  501. df['year_no'] = df['账期'].apply(lambda x: None if pd.isna(x) else str(x)[:4])
  502. df['month_no'] = df['账期'].apply(lambda x: None if pd.isna(x) else str(x)[-2:])
  503. def to_datetime(x):
  504. try:
  505. return pd.to_datetime(x)
  506. except Exception:
  507. return None
  508. df['登记日期'] = df['登记日期'].apply(to_datetime)
  509. df['进厂时间'] = df['进厂时间'].apply(to_datetime)
  510. def get_num(x):
  511. try:
  512. return float(x)
  513. except Exception:
  514. return None
  515. df['公里数'] = df['公里数'].apply(get_num)
  516. df['截止数据提取日行驶里程'] = df['截止数据提取日行驶里程'].apply(get_num)
  517. df['超出建议保养公里数'] = df['超出建议保养公里数'].apply(get_num)
  518. def get_int(x):
  519. try:
  520. return int(x)
  521. except Exception:
  522. return ""
  523. df['超出建议保养时间(天)'] = df['超出建议保养时间(天)'].apply(get_int)
  524. # 打印DataFrame的信息
  525. print(df.info())
  526. # 将处理后的数据保存到CSV文件中
  527. df.to_csv(path_or_buf=output_path,
  528. header=['year_month', 'che_pai_hao', 'che_xing', 'first_unit', 'second_unit', 'third_unit',
  529. 'deng_ji_ri_qi', 'jin_chang_shi_jian', 'jin_chang_gong_li', 'li_cheng', 'bao_yang',
  530. 'chao_bao_tian_shu', 'chao_bao_gong_li', 'raw_yi_ji', 'raw_er_ji', 'raw_san_ji',
  531. 'raw_che_pai_hao', 'che_pai_fail', 'area_no', 'area_name', 'city_no', 'city_name', 'area_no2',
  532. 'area_name2', 'city_id', 'city', 'district_id', 'district', 'year_no', 'month_no'],
  533. index=False,
  534. encoding='utf-8-sig')
  535. def data_import():
  536. # 定义 PowerShell 脚本的路径
  537. script_path = r"../../copy.ps1"
  538. # 目标表和文件信息
  539. table = "car.car_chao_bao" # 数据库目标表名
  540. # 表字段列名,用于指定导入数据的列顺序
  541. columns = "year_month,che_pai_hao,che_xing,first_unit,second_unit,third_unit,deng_ji_ri_qi,jin_chang_shi_jian,jin_chang_gong_li,li_cheng,bao_yang,chao_bao_tian_shu,chao_bao_gong_li,raw_yi_ji,raw_er_ji,raw_san_ji,raw_che_pai_hao,che_pai_fail,area_no,area_name,city_no,city_name,area_no2,area_name2,city_id,city,district_id,district,year_no,month_no"
  542. # 构造执行 PowerShell 脚本的命令
  543. command = f"powershell -File {script_path} -db_host {db_host} -db_port {db_port} -db_username {db_username} -db_password {db_password} -dbname {dbname} -table {table} -filename {output_path} -columns {columns}"
  544. # 打印生成的命令,方便调试和日志记录
  545. logger.info("command: {}", command)
  546. # 使用 subprocess 模块运行 PowerShell 命令,并捕获输出
  547. completed_process = subprocess.run(
  548. command, # 执行的命令
  549. check=False, # 如果命令执行失败,不抛出异常
  550. text=True, # 将输出作为字符串处理
  551. capture_output=True, # 捕获标准输出和标准错误
  552. )
  553. # 打印命令执行的结果,包括返回码、标准输出和标准错误
  554. logger.info("导入结果:\n{}\n{}\n{}", completed_process.returncode, completed_process.stdout,
  555. completed_process.stderr)
  556. # 定义正则表达式,用于匹配标准输出中的 COPY 结果
  557. p = re.compile(r"^(COPY) (\d+)$")
  558. count = None # 初始化计数变量
  559. matcher = p.match(completed_process.stdout) # 匹配标准输出中的 COPY 结果
  560. if matcher:
  561. count = int(matcher.group(2)) # 提取导入的数据行数
  562. # 如果没有成功提取到导入数据的行数,抛出运行时异常
  563. if count is None:
  564. raise RuntimeError("导入数据失败")
  565. def upload_file():
  566. remote_path = f'{remote_dir_path}{year_month}.xlsx' # 定义远程主机的目标文件路径
  567. # 使用paramiko.SSHClient创建一个SSH客户端对象,并通过with语句管理其上下文
  568. with paramiko.SSHClient() as ssh:
  569. # 设置自动添加主机密钥策略,避免因未知主机密钥导致连接失败
  570. ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
  571. # 连接到远程主机,传入主机地址、端口、用户名和密码
  572. ssh.connect(ssh_hostname, port=ssh_port, username=ssh_username, password=ssh_password)
  573. # 执行远程命令,创建远程目录(如果不存在)
  574. ssh.exec_command(f'mkdir -p {remote_dir_path}')
  575. # 打开SFTP会话,用于文件传输,并通过with语句管理其上下文
  576. with ssh.open_sftp() as sftp:
  577. # 记录日志,提示即将上传的本地文件和远程目标路径
  578. logger.info("upload {} to {}", input_path, remote_path)
  579. # 使用SFTP的put方法将本地文件上传到远程主机
  580. sftp.put(input_path, remote_path)
  581. # 记录日志,提示文件已成功上传
  582. logger.info("uploaded {}", input_path)
  583. def data_update():
  584. with psycopg.connect(
  585. conninfo=conn_info,
  586. ) as conn:
  587. with conn.cursor() as curs:
  588. # 插入过检
  589. sql = f"""
  590. insert
  591. into
  592. car_theme.wz_f_severely_over_maintained_leased_vehicles_details
  593. (
  594. statistical_month,
  595. card_num,
  596. car_brand,
  597. enable_date,
  598. arrival_time,
  599. kilometers_entering_the_factory,
  600. mileage_driven_as_of_data_extraction_date,
  601. should_maintenance_be_carried_out,
  602. exceeding_the_recommended_maintenance_time,
  603. exceeding_the_recommended_maintenance_mileage,
  604. city,
  605. dpt_sec,
  606. grid
  607. )
  608. select
  609. year_month as statistical_month,
  610. che_pai_hao as card_num,
  611. che_xing as car_brand,
  612. deng_ji_ri_qi as enable_date,
  613. jin_chang_shi_jian as arrival_time,
  614. jin_chang_gong_li as kilometers_entering_the_factory,
  615. li_cheng as mileage_driven_as_of_data_extraction_date,
  616. bao_yang as should_maintenance_be_carried_out,
  617. chao_bao_tian_shu as exceeding_the_recommended_maintenance_time,
  618. chao_bao_gong_li as exceeding_the_recommended_maintenance_mileage,
  619. first_unit as city,
  620. second_unit as dpt_sec,
  621. third_unit as grid
  622. from
  623. car.car_chao_bao
  624. where
  625. year_month = {year_month}
  626. """
  627. logger.info(f"sql: {sql}")
  628. curs.execute(sql)
  629. logger.info(f"update {curs.rowcount}")
  630. data_process()
  631. data_import()
  632. upload_file()
  633. data_update()